Commit Message:
- Added examples for Amazon product data extraction methods - Updated configuration options and enhanced documentation - Minor refactoring for improved performance and readability - Cleaned up version control settings.
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -208,7 +208,7 @@ git_issues.md
|
||||
|
||||
.next/
|
||||
.tests/
|
||||
.issues/
|
||||
# .issues/
|
||||
.docs/
|
||||
.issues/
|
||||
.gitboss/
|
||||
@@ -218,4 +218,5 @@ manage-collab.sh
|
||||
publish.sh
|
||||
combine.sh
|
||||
combined_output.txt
|
||||
tree.md
|
||||
tree.md
|
||||
.scripts
|
||||
@@ -11,6 +11,7 @@ from .user_agent_generator import UserAgentGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from typing import Union, List
|
||||
|
||||
|
||||
class BrowserConfig:
|
||||
@@ -39,8 +40,8 @@ class BrowserConfig:
|
||||
Default: None.
|
||||
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
viewport_width (int): Default viewport width for pages. Default: 1920.
|
||||
viewport_height (int): Default viewport height for pages. Default: 1080.
|
||||
viewport_width (int): Default viewport width for pages. Default: 1080.
|
||||
viewport_height (int): Default viewport height for pages. Default: 600.
|
||||
verbose (bool): Enable verbose logging.
|
||||
Default: True.
|
||||
accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
|
||||
@@ -79,7 +80,7 @@ class BrowserConfig:
|
||||
chrome_channel: str = "chrome",
|
||||
proxy: str = None,
|
||||
proxy_config: dict = None,
|
||||
viewport_width: int = 800,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
accept_downloads: bool = False,
|
||||
downloads_path: str = None,
|
||||
@@ -136,10 +137,15 @@ class BrowserConfig:
|
||||
self.debugging_port = debugging_port
|
||||
|
||||
user_agenr_generator = UserAgentGenerator()
|
||||
if self.user_agent_mode != "random":
|
||||
if self.user_agent_mode != "random" and self.user_agent_generator_config:
|
||||
self.user_agent = user_agenr_generator.generate(
|
||||
**(self.user_agent_generator_config or {})
|
||||
)
|
||||
elif self.user_agent_mode == "random":
|
||||
self.user_agent = user_agenr_generator.generate()
|
||||
else:
|
||||
pass
|
||||
|
||||
self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
|
||||
self.headers.setdefault("sec-ch-ua", self.browser_hint)
|
||||
|
||||
@@ -158,8 +164,8 @@ class BrowserConfig:
|
||||
chrome_channel=kwargs.get("chrome_channel", "chrome"),
|
||||
proxy=kwargs.get("proxy"),
|
||||
proxy_config=kwargs.get("proxy_config"),
|
||||
viewport_width=kwargs.get("viewport_width", 1920),
|
||||
viewport_height=kwargs.get("viewport_height", 1080),
|
||||
viewport_width=kwargs.get("viewport_width", 1080),
|
||||
viewport_height=kwargs.get("viewport_height", 600),
|
||||
accept_downloads=kwargs.get("accept_downloads", False),
|
||||
downloads_path=kwargs.get("downloads_path"),
|
||||
storage_state=kwargs.get("storage_state"),
|
||||
@@ -215,6 +221,8 @@ class CrawlerRunConfig:
|
||||
Default: False.
|
||||
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
|
||||
Default: False.
|
||||
parser_type (str): Type of parser to use for HTML parsing.
|
||||
Default: "lxml".
|
||||
|
||||
# Caching Parameters
|
||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||
@@ -322,6 +330,7 @@ class CrawlerRunConfig:
|
||||
keep_data_attributes: bool = False,
|
||||
remove_forms: bool = False,
|
||||
prettiify: bool = False,
|
||||
parser_type: str = "lxml",
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
@@ -345,7 +354,7 @@ class CrawlerRunConfig:
|
||||
semaphore_count: int = 5,
|
||||
|
||||
# Page Interaction Parameters
|
||||
js_code=None,
|
||||
js_code: Union[str, List[str]] = None,
|
||||
js_only: bool = False,
|
||||
ignore_body_visibility: bool = True,
|
||||
scan_full_page: bool = False,
|
||||
@@ -393,6 +402,7 @@ class CrawlerRunConfig:
|
||||
self.keep_data_attributes = keep_data_attributes
|
||||
self.remove_forms = remove_forms
|
||||
self.prettiify = prettiify
|
||||
self.parser_type = parser_type
|
||||
|
||||
# SSL Parameters
|
||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||
@@ -478,6 +488,7 @@ class CrawlerRunConfig:
|
||||
keep_data_attributes=kwargs.get("keep_data_attributes", False),
|
||||
remove_forms=kwargs.get("remove_forms", False),
|
||||
prettiify=kwargs.get("prettiify", False),
|
||||
parser_type=kwargs.get("parser_type", "lxml"),
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||
@@ -550,6 +561,7 @@ class CrawlerRunConfig:
|
||||
"keep_data_attributes": self.keep_data_attributes,
|
||||
"remove_forms": self.remove_forms,
|
||||
"prettiify": self.prettiify,
|
||||
"parser_type": self.parser_type,
|
||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||
"cache_mode": self.cache_mode,
|
||||
"session_id": self.session_id,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -7,7 +7,7 @@ from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import json # Added for serialization/deserialization
|
||||
from .utils import ensure_content_dirs, generate_content_hash
|
||||
from .models import CrawlResult
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
import xxhash
|
||||
import aiofiles
|
||||
from .config import NEED_MIGRATION
|
||||
@@ -295,13 +295,18 @@ class AsyncDatabaseManager:
|
||||
row_dict[field] = ""
|
||||
|
||||
# Parse JSON fields
|
||||
json_fields = ['media', 'links', 'metadata', 'response_headers']
|
||||
json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
|
||||
for field in json_fields:
|
||||
try:
|
||||
row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
|
||||
except json.JSONDecodeError:
|
||||
row_dict[field] = {}
|
||||
|
||||
if isinstance(row_dict['markdown'], Dict):
|
||||
row_dict['markdown_v2'] = row_dict['markdown']
|
||||
if row_dict['markdown'].get('raw_markdown'):
|
||||
row_dict['markdown'] = row_dict['markdown']['raw_markdown']
|
||||
|
||||
# Parse downloaded_files
|
||||
try:
|
||||
row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
|
||||
@@ -331,10 +336,28 @@ class AsyncDatabaseManager:
|
||||
content_map = {
|
||||
'html': (result.html, 'html'),
|
||||
'cleaned_html': (result.cleaned_html or "", 'cleaned'),
|
||||
'markdown': (result.markdown or "", 'markdown'),
|
||||
'markdown': None,
|
||||
'extracted_content': (result.extracted_content or "", 'extracted'),
|
||||
'screenshot': (result.screenshot or "", 'screenshots')
|
||||
}
|
||||
|
||||
try:
|
||||
if isinstance(result.markdown, MarkdownGenerationResult):
|
||||
content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
|
||||
elif hasattr(result, 'markdown_v2'):
|
||||
content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
|
||||
elif isinstance(result.markdown, str):
|
||||
markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
|
||||
content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
|
||||
else:
|
||||
content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
message=f"Error processing markdown content: {str(e)}",
|
||||
tag="WARNING"
|
||||
)
|
||||
# Fallback to empty markdown result
|
||||
content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
|
||||
|
||||
content_hashes = {}
|
||||
for field, (content, content_type) in content_map.items():
|
||||
|
||||
@@ -69,6 +69,24 @@ class AsyncWebCrawler:
|
||||
New way (recommended):
|
||||
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
|
||||
Attributes:
|
||||
browser_config (BrowserConfig): Configuration object for browser settings.
|
||||
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
|
||||
logger (AsyncLogger): Logger instance for recording events and errors.
|
||||
always_bypass_cache (bool): Whether to always bypass cache.
|
||||
crawl4ai_folder (str): Directory for storing cache.
|
||||
base_directory (str): Base directory for storing cache.
|
||||
ready (bool): Whether the crawler is ready for use.
|
||||
|
||||
Methods:
|
||||
start(): Start the crawler explicitly without using context manager.
|
||||
close(): Close the crawler explicitly without using context manager.
|
||||
arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
|
||||
awarmup(): Perform warmup sequence.
|
||||
arun_many(): Run the crawler for multiple sources.
|
||||
aprocess_html(): Process HTML content.
|
||||
"""
|
||||
_domain_last_hit = {}
|
||||
|
||||
@@ -321,7 +339,7 @@ class AsyncWebCrawler:
|
||||
|
||||
# Initialize processing variables
|
||||
async_response: AsyncCrawlResponse = None
|
||||
cached_result = None
|
||||
cached_result: CrawlResult = None
|
||||
screenshot_data = None
|
||||
pdf_data = None
|
||||
extracted_content = None
|
||||
@@ -373,52 +391,89 @@ class AsyncWebCrawler:
|
||||
tag="FETCH"
|
||||
)
|
||||
|
||||
# Process the HTML content
|
||||
crawl_result = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
config=config, # Pass the config object instead of individual parameters
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
verbose=config.verbose,
|
||||
is_raw_html = True if url.startswith("raw:") else False,
|
||||
**kwargs
|
||||
)
|
||||
# Process the HTML content
|
||||
crawl_result = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
config=config, # Pass the config object instead of individual parameters
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
verbose=config.verbose,
|
||||
is_raw_html = True if url.startswith("raw:") else False,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
# crawl_result.status_code = async_response.status_code
|
||||
# crawl_result.response_headers = async_response.response_headers
|
||||
# crawl_result.downloaded_files = async_response.downloaded_files
|
||||
# crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
|
||||
# else:
|
||||
# crawl_result.status_code = 200
|
||||
# crawl_result.response_headers = cached_result.response_headers if cached_result else {}
|
||||
# crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache
|
||||
|
||||
# # Check and set values from async_response to crawl_result
|
||||
try:
|
||||
for key in vars(async_response):
|
||||
if hasattr(crawl_result, key):
|
||||
value = getattr(async_response, key, None)
|
||||
current_value = getattr(crawl_result, key, None)
|
||||
if value is not None and not current_value:
|
||||
try:
|
||||
setattr(crawl_result, key, value)
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
message=f"Failed to set attribute {key}: {str(e)}",
|
||||
tag="WARNING"
|
||||
)
|
||||
except Exception as e:
|
||||
self.logger.warning(
|
||||
message=f"Error copying response attributes: {str(e)}",
|
||||
tag="WARNING"
|
||||
)
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = getattr(config, 'session_id', None)
|
||||
|
||||
self.logger.success(
|
||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||||
tag="COMPLETE",
|
||||
params={
|
||||
"url": cache_context.display_url,
|
||||
"status": crawl_result.success,
|
||||
"timing": f"{time.perf_counter() - start_time:.2f}s"
|
||||
},
|
||||
colors={
|
||||
"status": Fore.GREEN if crawl_result.success else Fore.RED,
|
||||
"timing": Fore.YELLOW
|
||||
}
|
||||
)
|
||||
|
||||
# Update cache if appropriate
|
||||
if cache_context.should_write() and not bool(cached_result):
|
||||
await async_db_manager.acache_url(crawl_result)
|
||||
|
||||
return crawl_result
|
||||
|
||||
# Set response data
|
||||
if async_response:
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
crawl_result.downloaded_files = async_response.downloaded_files
|
||||
crawl_result.ssl_certificate = async_response.ssl_certificate # Add SSL certificate
|
||||
else:
|
||||
crawl_result.status_code = 200
|
||||
crawl_result.response_headers = cached_result.response_headers if cached_result else {}
|
||||
crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None # Add SSL certificate from cache
|
||||
self.logger.success(
|
||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||||
tag="COMPLETE",
|
||||
params={
|
||||
"url": cache_context.display_url,
|
||||
"status": True,
|
||||
"timing": f"{time.perf_counter() - start_time:.2f}s"
|
||||
},
|
||||
colors={
|
||||
"status": Fore.GREEN,
|
||||
"timing": Fore.YELLOW
|
||||
}
|
||||
)
|
||||
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = getattr(config, 'session_id', None)
|
||||
|
||||
self.logger.success(
|
||||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||||
tag="COMPLETE",
|
||||
params={
|
||||
"url": cache_context.display_url,
|
||||
"status": crawl_result.success,
|
||||
"timing": f"{time.perf_counter() - start_time:.2f}s"
|
||||
},
|
||||
colors={
|
||||
"status": Fore.GREEN if crawl_result.success else Fore.RED,
|
||||
"timing": Fore.YELLOW
|
||||
}
|
||||
)
|
||||
|
||||
# Update cache if appropriate
|
||||
if cache_context.should_write() and not bool(cached_result):
|
||||
await async_db_manager.acache_url(crawl_result)
|
||||
|
||||
return crawl_result
|
||||
cached_result.success = bool(html)
|
||||
cached_result.session_id = getattr(config, 'session_id', None)
|
||||
return cached_result
|
||||
|
||||
except Exception as e:
|
||||
error_context = get_error_context(sys.exc_info())
|
||||
@@ -465,6 +520,7 @@ class AsyncWebCrawler:
|
||||
extracted_content: Previously extracted content (if any)
|
||||
config: Configuration object controlling processing behavior
|
||||
screenshot: Screenshot data (if any)
|
||||
pdf_data: PDF data (if any)
|
||||
verbose: Whether to enable verbose logging
|
||||
**kwargs: Additional parameters for backwards compatibility
|
||||
|
||||
|
||||
@@ -25,8 +25,26 @@ class CacheContext:
|
||||
|
||||
This class centralizes all cache-related logic and URL type checking,
|
||||
making the caching behavior more predictable and maintainable.
|
||||
|
||||
Attributes:
|
||||
url (str): The URL being processed.
|
||||
cache_mode (CacheMode): The cache mode for the current operation.
|
||||
always_bypass (bool): If True, bypasses caching for this operation.
|
||||
is_cacheable (bool): True if the URL is cacheable, False otherwise.
|
||||
is_web_url (bool): True if the URL is a web URL, False otherwise.
|
||||
is_local_file (bool): True if the URL is a local file, False otherwise.
|
||||
is_raw_html (bool): True if the URL is raw HTML, False otherwise.
|
||||
_url_display (str): The display name for the URL (web, local file, or raw HTML).
|
||||
"""
|
||||
def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
|
||||
"""
|
||||
Initializes the CacheContext with the provided URL and cache mode.
|
||||
|
||||
Args:
|
||||
url (str): The URL being processed.
|
||||
cache_mode (CacheMode): The cache mode for the current operation.
|
||||
always_bypass (bool): If True, bypasses caching for this operation.
|
||||
"""
|
||||
self.url = url
|
||||
self.cache_mode = cache_mode
|
||||
self.always_bypass = always_bypass
|
||||
@@ -37,13 +55,31 @@ class CacheContext:
|
||||
self._url_display = url if not self.is_raw_html else "Raw HTML"
|
||||
|
||||
def should_read(self) -> bool:
|
||||
"""Determines if cache should be read based on context."""
|
||||
"""
|
||||
Determines if cache should be read based on context.
|
||||
|
||||
How it works:
|
||||
1. If always_bypass is True or is_cacheable is False, return False.
|
||||
2. If cache_mode is ENABLED or READ_ONLY, return True.
|
||||
|
||||
Returns:
|
||||
bool: True if cache should be read, False otherwise.
|
||||
"""
|
||||
if self.always_bypass or not self.is_cacheable:
|
||||
return False
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]
|
||||
|
||||
def should_write(self) -> bool:
|
||||
"""Determines if cache should be written based on context."""
|
||||
"""
|
||||
Determines if cache should be written based on context.
|
||||
|
||||
How it works:
|
||||
1. If always_bypass is True or is_cacheable is False, return False.
|
||||
2. If cache_mode is ENABLED or WRITE_ONLY, return True.
|
||||
|
||||
Returns:
|
||||
bool: True if cache should be written, False otherwise.
|
||||
"""
|
||||
if self.always_bypass or not self.is_cacheable:
|
||||
return False
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]
|
||||
|
||||
@@ -7,22 +7,43 @@ from .utils import *
|
||||
|
||||
# Define the abstract base class for chunking strategies
|
||||
class ChunkingStrategy(ABC):
|
||||
"""
|
||||
Abstract base class for chunking strategies.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def chunk(self, text: str) -> list:
|
||||
"""
|
||||
Abstract method to chunk the given text.
|
||||
|
||||
Args:
|
||||
text (str): The text to chunk.
|
||||
|
||||
Returns:
|
||||
list: A list of chunks.
|
||||
"""
|
||||
pass
|
||||
|
||||
# Create an identity chunking strategy f(x) = [x]
|
||||
class IdentityChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that returns the input text as a single chunk.
|
||||
"""
|
||||
def chunk(self, text: str) -> list:
|
||||
return [text]
|
||||
|
||||
# Regex-based chunking
|
||||
class RegexChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text based on regular expression patterns.
|
||||
"""
|
||||
def __init__(self, patterns=None, **kwargs):
|
||||
"""
|
||||
Initialize the RegexChunking object.
|
||||
|
||||
Args:
|
||||
patterns (list): A list of regular expression patterns to split text.
|
||||
"""
|
||||
if patterns is None:
|
||||
patterns = [r'\n\n'] # Default split pattern
|
||||
self.patterns = patterns
|
||||
@@ -38,9 +59,15 @@ class RegexChunking(ChunkingStrategy):
|
||||
|
||||
# NLP-based sentence chunking
|
||||
class NlpSentenceChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
|
||||
"""
|
||||
def __init__(self, **kwargs):
|
||||
"""
|
||||
Initialize the NlpSentenceChunking object.
|
||||
"""
|
||||
load_nltk_punkt()
|
||||
pass
|
||||
|
||||
|
||||
def chunk(self, text: str) -> list:
|
||||
# Improved regex for sentence splitting
|
||||
@@ -57,8 +84,21 @@ class NlpSentenceChunking(ChunkingStrategy):
|
||||
|
||||
# Topic-based segmentation using TextTiling
|
||||
class TopicSegmentationChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.
|
||||
|
||||
How it works:
|
||||
1. Segment the text into topics using TextTilingTokenizer
|
||||
2. Extract keywords for each topic segment
|
||||
"""
|
||||
|
||||
def __init__(self, num_keywords=3, **kwargs):
|
||||
"""
|
||||
Initialize the TopicSegmentationChunking object.
|
||||
|
||||
Args:
|
||||
num_keywords (int): The number of keywords to extract for each topic segment.
|
||||
"""
|
||||
import nltk as nl
|
||||
self.tokenizer = nl.tokenize.TextTilingTokenizer()
|
||||
self.num_keywords = num_keywords
|
||||
@@ -88,6 +128,14 @@ class TopicSegmentationChunking(ChunkingStrategy):
|
||||
|
||||
# Fixed-length word chunks
|
||||
class FixedLengthWordChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text into fixed-length word chunks.
|
||||
|
||||
How it works:
|
||||
1. Split the text into words
|
||||
2. Create chunks of fixed length
|
||||
3. Return the list of chunks
|
||||
"""
|
||||
def __init__(self, chunk_size=100, **kwargs):
|
||||
"""
|
||||
Initialize the fixed-length word chunking strategy with the given chunk size.
|
||||
@@ -103,6 +151,14 @@ class FixedLengthWordChunking(ChunkingStrategy):
|
||||
|
||||
# Sliding window chunking
|
||||
class SlidingWindowChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text into overlapping word chunks.
|
||||
|
||||
How it works:
|
||||
1. Split the text into words
|
||||
2. Slide a window of `window_size` words across the text, advancing `step` words each time
|
||||
3. Return the list of chunks
|
||||
"""
|
||||
def __init__(self, window_size=100, step=50, **kwargs):
|
||||
"""
|
||||
Initialize the sliding window chunking strategy with the given window size and
|
||||
@@ -133,6 +189,15 @@ class SlidingWindowChunking(ChunkingStrategy):
|
||||
return chunks
|
||||
|
||||
class OverlappingWindowChunking(ChunkingStrategy):
|
||||
"""
|
||||
Chunking strategy that splits text into overlapping word chunks.
|
||||
|
||||
How it works:
|
||||
1. Split the text into words using whitespace
|
||||
2. Create chunks of fixed length equal to the window size
|
||||
3. Advance the window so that consecutive chunks share `overlap` words
|
||||
4. Return the list of chunks
|
||||
"""
|
||||
def __init__(self, window_size=1000, overlap=100, **kwargs):
|
||||
"""
|
||||
Initialize the overlapping window chunking strategy with the given window size and
|
||||
|
||||
@@ -9,17 +9,8 @@ from .utils import clean_tokens
|
||||
from abc import ABC, abstractmethod
|
||||
import math
|
||||
from snowballstemmer import stemmer
|
||||
|
||||
|
||||
# import regex
|
||||
# def tokenize_text(text):
|
||||
# # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters
|
||||
# pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]'
|
||||
# return regex.findall(pattern, text)
|
||||
|
||||
# from nltk.stem import PorterStemmer
|
||||
# ps = PorterStemmer()
|
||||
class RelevantContentFilter(ABC):
|
||||
"""Abstract base class for content filtering strategies"""
|
||||
def __init__(self, user_query: str = None):
|
||||
self.user_query = user_query
|
||||
self.included_tags = {
|
||||
@@ -171,9 +162,8 @@ class RelevantContentFilter(ABC):
|
||||
chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
|
||||
def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
|
||||
"""Common method for extracting text chunks"""
|
||||
_text_cache = {}
|
||||
def fast_text(element: Tag) -> str:
|
||||
@@ -271,7 +261,38 @@ class RelevantContentFilter(ABC):
|
||||
return str(tag) # Fallback to original if anything fails
|
||||
|
||||
class BM25ContentFilter(RelevantContentFilter):
|
||||
"""
|
||||
Content filtering using BM25 algorithm with priority tag handling.
|
||||
|
||||
How it works:
|
||||
1. Extracts page metadata with fallbacks.
|
||||
2. Extracts text chunks from the body element.
|
||||
3. Tokenizes the corpus and query.
|
||||
4. Applies BM25 algorithm to calculate scores for each chunk.
|
||||
5. Filters out chunks below the threshold.
|
||||
6. Sorts chunks by score in descending order.
|
||||
7. Returns the top N chunks.
|
||||
|
||||
Attributes:
|
||||
user_query (str): User query for filtering (optional).
|
||||
bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
|
||||
language (str): Language for stemming (default: 'english').
|
||||
|
||||
Methods:
|
||||
filter_content(self, html: str, min_word_threshold: int = None)
|
||||
"""
|
||||
def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
|
||||
"""
|
||||
Initializes the BM25ContentFilter class. If no user query is provided, it falls back to page metadata.
|
||||
|
||||
Note:
|
||||
If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
|
||||
|
||||
Args:
|
||||
user_query (str): User query for filtering (optional).
|
||||
bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
|
||||
language (str): Language for stemming (default: 'english').
|
||||
"""
|
||||
super().__init__(user_query=user_query)
|
||||
self.bm25_threshold = bm25_threshold
|
||||
self.priority_tags = {
|
||||
@@ -290,7 +311,20 @@ class BM25ContentFilter(RelevantContentFilter):
|
||||
self.stemmer = stemmer(language)
|
||||
|
||||
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
|
||||
"""Implements content filtering using BM25 algorithm with priority tag handling"""
|
||||
"""
|
||||
Implements content filtering using BM25 algorithm with priority tag handling.
|
||||
|
||||
Note:
|
||||
This method implements the filtering logic for the BM25ContentFilter class.
|
||||
It takes HTML content as input and returns a list of filtered text chunks.
|
||||
|
||||
Args:
|
||||
html (str): HTML content to be filtered.
|
||||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||||
|
||||
Returns:
|
||||
List[str]: List of filtered text chunks.
|
||||
"""
|
||||
if not html or not isinstance(html, str):
|
||||
return []
|
||||
|
||||
@@ -357,15 +391,42 @@ class BM25ContentFilter(RelevantContentFilter):
|
||||
|
||||
return [self.clean_element(tag) for _, _, tag in selected_candidates]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class PruningContentFilter(RelevantContentFilter):
|
||||
"""
|
||||
Content filtering using pruning algorithm with dynamic threshold.
|
||||
|
||||
How it works:
|
||||
1. Extracts page metadata with fallbacks.
|
||||
2. Extracts text chunks from the body element.
|
||||
3. Applies pruning algorithm to calculate scores for each chunk.
|
||||
4. Filters out chunks below the threshold.
|
||||
5. Sorts chunks by score in descending order.
|
||||
6. Returns the top N chunks.
|
||||
|
||||
Attributes:
|
||||
user_query (str): User query for filtering (optional), if not provided, falls back to page metadata.
|
||||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||||
threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
|
||||
threshold (float): Fixed threshold value (default: 0.48).
|
||||
|
||||
Methods:
|
||||
filter_content(self, html: str, min_word_threshold: int = None):
|
||||
"""
|
||||
def __init__(self, user_query: str = None, min_word_threshold: int = None,
|
||||
threshold_type: str = 'fixed', threshold: float = 0.48):
|
||||
super().__init__(user_query)
|
||||
"""
|
||||
Initializes the PruningContentFilter class. If no user query is provided, it falls back to page metadata.
|
||||
|
||||
Note:
|
||||
If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
|
||||
|
||||
Args:
|
||||
user_query (str): User query for filtering (optional).
|
||||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||||
threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
|
||||
threshold (float): Fixed threshold value (default: 0.48).
|
||||
"""
|
||||
super().__init__(None)
|
||||
self.min_word_threshold = min_word_threshold
|
||||
self.threshold_type = threshold_type
|
||||
self.threshold = threshold
|
||||
@@ -418,6 +479,20 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
}
|
||||
|
||||
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
|
||||
"""
|
||||
Implements content filtering using pruning algorithm with dynamic threshold.
|
||||
|
||||
Note:
|
||||
This method implements the filtering logic for the PruningContentFilter class.
|
||||
It takes HTML content as input and returns a list of filtered text chunks.
|
||||
|
||||
Args:
|
||||
html (str): HTML content to be filtered.
|
||||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||||
|
||||
Returns:
|
||||
List[str]: List of filtered text chunks.
|
||||
"""
|
||||
if not html or not isinstance(html, str):
|
||||
return []
|
||||
|
||||
@@ -444,15 +519,23 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
return content_blocks
|
||||
|
||||
def _remove_comments(self, soup):
|
||||
"""Removes HTML comments"""
|
||||
for element in soup(text=lambda text: isinstance(text, Comment)):
|
||||
element.extract()
|
||||
|
||||
def _remove_unwanted_tags(self, soup):
|
||||
"""Removes unwanted tags"""
|
||||
for tag in self.excluded_tags:
|
||||
for element in soup.find_all(tag):
|
||||
element.decompose()
|
||||
|
||||
def _prune_tree(self, node):
|
||||
"""
|
||||
Prunes the tree starting from the given node.
|
||||
|
||||
Args:
|
||||
node (Tag): The node from which the pruning starts.
|
||||
"""
|
||||
if not node or not hasattr(node, 'name') or node.name is None:
|
||||
return
|
||||
|
||||
@@ -495,6 +578,7 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
self._prune_tree(child)
|
||||
|
||||
def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
|
||||
"""Computes the composite score"""
|
||||
if self.min_word_threshold:
|
||||
# Get raw text from metrics node - avoid extra processing
|
||||
text = metrics['node'].get_text(strip=True)
|
||||
@@ -531,6 +615,7 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
return score / total_weight if total_weight > 0 else 0
|
||||
|
||||
def _compute_class_id_weight(self, node):
|
||||
"""Computes the class ID weight"""
|
||||
class_id_score = 0
|
||||
if 'class' in node.attrs:
|
||||
classes = ' '.join(node['class'])
|
||||
|
||||
@@ -64,6 +64,17 @@ class ContentScrapingStrategy(ABC):
|
||||
pass
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
Class for web content scraping. Perhaps the most important class.
|
||||
|
||||
How it works:
|
||||
1. Extract content from HTML using BeautifulSoup.
|
||||
2. Clean the extracted content using a content cleaning strategy.
|
||||
3. Filter the cleaned content using a content filtering strategy.
|
||||
4. Generate markdown content from the filtered content.
|
||||
5. Return the markdown content.
|
||||
"""
|
||||
|
||||
def __init__(self, logger=None):
|
||||
self.logger = logger
|
||||
|
||||
@@ -74,17 +85,57 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
log_method(message=message, tag=tag, **kwargs)
|
||||
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Main entry point for content scraping.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page to scrape.
|
||||
html (str): The HTML content of the page.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
|
||||
|
||||
- 'markdown': The generated markdown content. Currently a str; it will soon become a MarkdownGenerationResult, with the string available as 'markdown.raw_markdown'.
|
||||
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
|
||||
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
|
||||
- 'markdown_v2': The generated markdown content with relevant content filtered. This key is temporary; it will be removed soon and replaced by 'markdown'.
|
||||
"""
|
||||
return self._scrap(url, html, is_async=False, **kwargs)
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
    """
    Scrape a page asynchronously.

    Runs `_scrap` in a worker thread via `asyncio.to_thread` and awaits its result.

    Args:
        url (str): The URL of the page to scrape.
        html (str): The HTML content of the page.
        **kwargs: Extra options, passed through unchanged to `_scrap`.

    Returns:
        Dict[str, Any]: The value returned by `_scrap`. As documented for this
        entry point, the dictionary contains the following keys:

        - 'markdown': The generated markdown (currently a str; planned to become
          a MarkdownGenerationResult reachable via 'markdown.raw_markdown').
        - 'fit_markdown': Markdown with only the relevant content kept; slated for
          removal in favour of 'markdown.fit_markdown'.
        - 'fit_html': HTML with only the relevant content kept; slated for removal
          in favour of 'markdown.fit_html'.
        - 'markdown_v2': Temporary key that will be removed and replaced by
          'markdown'.
    """
    # Off-load the blocking scrape to a thread.
    pending = asyncio.to_thread(self._scrap, url, html, **kwargs)
    return await pending
|
||||
|
||||
def _generate_markdown_content(self,
|
||||
cleaned_html: str,
|
||||
html: str,
|
||||
url: str,
|
||||
success: bool,
|
||||
**kwargs) -> Dict[str, Any]:
|
||||
def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate markdown content from cleaned HTML.
|
||||
|
||||
Args:
|
||||
cleaned_html (str): The cleaned HTML content.
|
||||
html (str): The original HTML content.
|
||||
url (str): The URL of the page.
|
||||
success (bool): Whether the content was successfully cleaned.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the generated markdown content.
|
||||
"""
|
||||
markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
|
||||
|
||||
if markdown_generator:
|
||||
@@ -158,6 +209,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
|
||||
def flatten_nested_elements(self, node):
|
||||
"""
|
||||
Flatten nested elements in an HTML tree.
|
||||
|
||||
Args:
|
||||
node (Tag): The root node of the HTML tree.
|
||||
|
||||
Returns:
|
||||
Tag: The flattened HTML tree.
|
||||
"""
|
||||
if isinstance(node, NavigableString):
|
||||
return node
|
||||
if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
|
||||
@@ -166,6 +226,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return node
|
||||
|
||||
def find_closest_parent_with_useful_text(self, tag, **kwargs):
|
||||
"""
|
||||
Find the closest parent with useful text.
|
||||
|
||||
Args:
|
||||
tag (Tag): The starting tag to search from.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Tag: The closest parent with useful text, or None if not found.
|
||||
"""
|
||||
image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
|
||||
current_tag = tag
|
||||
while current_tag:
|
||||
@@ -179,6 +249,17 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return None
|
||||
|
||||
def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
|
||||
"""
|
||||
Remove unwanted attributes from an HTML element.
|
||||
|
||||
Args:
|
||||
element (Tag): The HTML element to remove attributes from.
|
||||
important_attrs (list): List of important attributes to keep.
|
||||
keep_data_attributes (bool): Whether to keep data attributes.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
attrs_to_remove = []
|
||||
for attr in element.attrs:
|
||||
if attr not in important_attrs:
|
||||
@@ -192,6 +273,26 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
del element[attr]
|
||||
|
||||
def process_image(self, img, url, index, total_images, **kwargs):
|
||||
"""
|
||||
Process an image element.
|
||||
|
||||
How it works:
|
||||
1. Check whether the image has a valid display and whether it is inside undesired HTML elements.
|
||||
2. Score an image for its usefulness.
|
||||
3. Extract image file metadata to extract size and extension.
|
||||
4. Generate a dictionary with the processed image information.
|
||||
5. Return the processed image information.
|
||||
|
||||
Args:
|
||||
img (Tag): The image element to process.
|
||||
url (str): The URL of the page containing the image.
|
||||
index (int): The index of the image in the list of images.
|
||||
total_images (int): The total number of images in the list.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the processed image information.
|
||||
"""
|
||||
parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w')
|
||||
if ' ' in u else None}
|
||||
for u in [f"http{p}" for p in s.split("http") if p]]
|
||||
@@ -316,6 +417,23 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return image_variants if image_variants else None
|
||||
|
||||
def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Process an HTML element.
|
||||
|
||||
How it works:
|
||||
1. Check if the element is an image, video, or audio.
|
||||
2. Extract the element's attributes and content.
|
||||
3. Process the element based on its type.
|
||||
4. Return the processed element information.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page containing the element.
|
||||
element (Tag): The HTML element to process.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the processed element information.
|
||||
"""
|
||||
media = {'images': [], 'videos': [], 'audios': []}
|
||||
internal_links_dict = {}
|
||||
external_links_dict = {}
|
||||
@@ -334,6 +452,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
}
|
||||
|
||||
def _process_element(self, url, element: PageElement, media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
|
||||
"""
|
||||
Process an HTML element.
|
||||
"""
|
||||
try:
|
||||
if isinstance(element, NavigableString):
|
||||
if isinstance(element, Comment):
|
||||
@@ -534,11 +655,25 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
return False
|
||||
|
||||
def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
|
||||
"""
|
||||
Extract content from HTML using BeautifulSoup.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page to scrape.
|
||||
html (str): The HTML content of the page to scrape.
|
||||
word_count_threshold (int): The minimum word count threshold for content extraction.
|
||||
css_selector (str): The CSS selector to use for content extraction.
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the extracted content.
|
||||
"""
|
||||
success = True
|
||||
if not html:
|
||||
return None
|
||||
|
||||
soup = BeautifulSoup(html, 'lxml')
|
||||
parser_type = kwargs.get('parser', 'lxml')
|
||||
soup = BeautifulSoup(html, parser_type)
|
||||
body = soup.body
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
|
||||
1440
crawl4ai/extraction_strategy.bak.py
Normal file
1440
crawl4ai/extraction_strategy.bak.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -62,29 +62,66 @@ class ExtractionStrategy(ABC):
|
||||
return extracted_content
|
||||
|
||||
class NoExtractionStrategy(ExtractionStrategy):
    """
    Pass-through strategy that performs no extraction.

    `extract` returns the whole HTML string as a single block; `run` wraps each
    supplied section in its own block.
    """

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Return the given HTML unchanged as one block.

        Args:
            url (str): The URL of the page (not used).
            html (str): The content to hand back.
            *q: Ignored.
            **kwargs: Ignored.

        Returns:
            List[Dict[str, Any]]: A one-element list holding
            `{"index": 0, "content": html}`.
        """
        whole_page = {"index": 0, "content": html}
        return [whole_page]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Wrap every section in a block without modifying its text.

        Args:
            url (str): The URL of the page (not used).
            sections (List[str]): The sections to wrap.
            *q: Ignored.
            **kwargs: Ignored.

        Returns:
            List[Dict[str, Any]]: One dict per section with its position under
            'index', an empty 'tags' list, and the section text under 'content'.
        """
        blocks = []
        for position, text in enumerate(sections):
            blocks.append({"index": position, "tags": [], "content": text})
        return blocks
|
||||
|
||||
|
||||
#######################################################
|
||||
# Strategies using LLM-based extraction for text data #
|
||||
#######################################################
|
||||
|
||||
|
||||
|
||||
class LLMExtractionStrategy(ExtractionStrategy):
|
||||
"""
|
||||
A strategy that uses an LLM to extract meaningful content from the HTML.
|
||||
|
||||
Attributes:
|
||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
||||
api_token: The API token for the provider.
|
||||
instruction: The instruction to use for the LLM model.
|
||||
schema: Pydantic model schema for structured data.
|
||||
extraction_type: "block" or "schema".
|
||||
chunk_token_threshold: Maximum tokens per chunk.
|
||||
overlap_rate: Overlap between chunks.
|
||||
word_token_rate: Word to token conversion rate.
|
||||
apply_chunking: Whether to apply chunking.
|
||||
base_url: The base URL for the API request.
|
||||
api_base: The base URL for the API request.
|
||||
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
|
||||
verbose: Whether to print verbose output.
|
||||
usages: List of individual token usages.
|
||||
total_usage: Accumulated token usage.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
|
||||
instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
|
||||
"""
|
||||
Initialize the LLM extraction strategy.
|
||||
|
||||
Args:
|
||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
||||
api_token: The API token for the provider.
|
||||
instruction: The instruction to use for the LLM model.
|
||||
schema: Pydantic model schema for structured data.
|
||||
extraction_type: "block" or "schema".
|
||||
chunk_token_threshold: Maximum tokens per chunk.
|
||||
overlap_rate: Overlap between chunks.
|
||||
word_token_rate: Word to token conversion rate.
|
||||
apply_chunking: Whether to apply chunking.
|
||||
base_url: The base URL for the API request.
|
||||
api_base: The base URL for the API request.
|
||||
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
|
||||
verbose: Whether to print verbose output.
|
||||
usages: List of individual token usages.
|
||||
total_usage: Accumulated token usage.
|
||||
|
||||
:param provider: The provider to use for extraction.
|
||||
:param api_token: The API token for the provider.
|
||||
:param instruction: The instruction to use for the LLM model.
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.provider = provider
|
||||
@@ -114,6 +151,22 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
|
||||
|
||||
def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract meaningful blocks or chunks from the given HTML using an LLM.
|
||||
|
||||
How it works:
|
||||
1. Construct a prompt with variables.
|
||||
2. Make a request to the LLM using the prompt.
|
||||
3. Parse the response and extract blocks or chunks.
|
||||
|
||||
Args:
|
||||
url: The URL of the webpage.
|
||||
ix: Index of the block.
|
||||
html: The HTML content of the webpage.
|
||||
|
||||
Returns:
|
||||
A list of extracted blocks or chunks.
|
||||
"""
|
||||
if self.verbose:
|
||||
# print("[LOG] Extracting blocks from URL:", url)
|
||||
print(f"[LOG] Call LLM for {url} - block index: {ix}")
|
||||
@@ -180,6 +233,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
return blocks
|
||||
|
||||
def _merge(self, documents, chunk_token_threshold, overlap):
|
||||
"""
|
||||
Merge documents into sections based on chunk_token_threshold and overlap.
|
||||
"""
|
||||
chunks = []
|
||||
sections = []
|
||||
total_tokens = 0
|
||||
@@ -229,6 +285,13 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
|
||||
|
||||
Args:
|
||||
url: The URL of the webpage.
|
||||
sections: List of sections (strings) to process.
|
||||
|
||||
Returns:
|
||||
A list of extracted blocks or chunks.
|
||||
"""
|
||||
|
||||
merged_sections = self._merge(
|
||||
@@ -285,12 +348,30 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
for i, usage in enumerate(self.usages, 1):
|
||||
print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")
|
||||
|
||||
|
||||
#######################################################
|
||||
# Strategies using clustering for text data extraction #
|
||||
#######################################################
|
||||
|
||||
class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Extract meaningful blocks or chunks from the given HTML using cosine similarity.
|
||||
|
||||
How it works:
|
||||
1. Pre-filter documents using embeddings and semantic_filter.
|
||||
2. Perform clustering using cosine similarity.
|
||||
3. Organize texts by their cluster labels, retaining order.
|
||||
4. Filter clusters by word count.
|
||||
5. Extract meaningful blocks or chunks from the filtered clusters.
|
||||
|
||||
Attributes:
|
||||
semantic_filter (str): A keyword filter for document filtering.
|
||||
word_count_threshold (int): Minimum number of words per cluster.
|
||||
max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
|
||||
linkage_method (str): The linkage method for hierarchical clustering.
|
||||
top_k (int): Number of top categories to extract.
|
||||
model_name (str): The name of the sentence-transformers model.
|
||||
sim_threshold (float): The similarity threshold for clustering.
|
||||
"""
|
||||
def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
|
||||
"""
|
||||
Initialize the strategy with clustering parameters.
|
||||
@@ -368,11 +449,13 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.
|
||||
|
||||
:param documents: List of text chunks (documents).
|
||||
:param semantic_filter: A string containing the keywords for filtering.
|
||||
:param threshold: Cosine similarity threshold for filtering documents.
|
||||
:param at_least_k: Minimum number of documents to return.
|
||||
:return: List of filtered documents, ensuring at least `at_least_k` documents.
|
||||
Args:
|
||||
documents (List[str]): A list of document texts.
|
||||
semantic_filter (str): A keyword filter for document filtering.
|
||||
at_least_k (int): The minimum number of documents to return.
|
||||
|
||||
Returns:
|
||||
List[str]: A list of filtered and sorted document texts.
|
||||
"""
|
||||
|
||||
if not semantic_filter:
|
||||
@@ -410,8 +493,11 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Get BERT embeddings for a list of sentences.
|
||||
|
||||
:param sentences: List of text chunks (sentences).
|
||||
:return: NumPy array of embeddings.
|
||||
Args:
|
||||
sentences (List[str]): A list of text chunks (sentences).
|
||||
|
||||
Returns:
|
||||
NumPy array of embeddings.
|
||||
"""
|
||||
# if self.buffer_embeddings.any() and not bypass_buffer:
|
||||
# return self.buffer_embeddings
|
||||
@@ -455,8 +541,11 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Perform hierarchical clustering on sentences and return cluster labels.
|
||||
|
||||
:param sentences: List of text chunks (sentences).
|
||||
:return: NumPy array of cluster labels.
|
||||
Args:
|
||||
sentences (List[str]): A list of text chunks (sentences).
|
||||
|
||||
Returns:
|
||||
NumPy array of cluster labels.
|
||||
"""
|
||||
# Get embeddings
|
||||
from scipy.cluster.hierarchy import linkage, fcluster
|
||||
@@ -472,12 +561,15 @@ class CosineStrategy(ExtractionStrategy):
|
||||
labels = fcluster(linked, self.max_dist, criterion='distance')
|
||||
return labels
|
||||
|
||||
def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]):
|
||||
def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
|
||||
"""
|
||||
Filter clusters to remove those with a word count below the threshold.
|
||||
|
||||
:param clusters: Dictionary of clusters.
|
||||
:return: Filtered dictionary of clusters.
|
||||
Args:
|
||||
clusters (Dict[int, List[str]]): Dictionary of clusters.
|
||||
|
||||
Returns:
|
||||
Dict[int, List[str]]: Filtered dictionary of clusters.
|
||||
"""
|
||||
filtered_clusters = {}
|
||||
for cluster_id, texts in clusters.items():
|
||||
@@ -496,9 +588,12 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Extract clusters from HTML content using hierarchical clustering.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param html: The HTML content of the webpage.
|
||||
:return: A list of dictionaries representing the clusters.
|
||||
Args:
|
||||
url (str): The URL of the webpage.
|
||||
html (str): The HTML content of the webpage.
|
||||
|
||||
Returns:
|
||||
List[Dict[str, Any]]: A list of processed JSON blocks.
|
||||
"""
|
||||
# Assume `html` is a list of text chunks for this strategy
|
||||
t = time.time()
|
||||
@@ -560,159 +655,85 @@ class CosineStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Process sections using hierarchical clustering.
|
||||
|
||||
:param url: The URL of the webpage.
|
||||
:param sections: List of sections (strings) to process.
|
||||
:param provider: The provider to be used for extraction (not used here).
|
||||
:param api_token: Optional API token for the provider (not used here).
|
||||
:return: A list of processed JSON blocks.
|
||||
Args:
|
||||
url (str): The URL of the webpage.
|
||||
sections (List[str]): List of sections (strings) to process.
|
||||
|
||||
Returns:
List[Dict[str, Any]]: A list of processed JSON blocks.
|
||||
"""
|
||||
# This strategy processes all sections together
|
||||
|
||||
return self.extract(url, self.DEL.join(sections), **kwargs)
|
||||
|
||||
|
||||
#######################################################
|
||||
# Strategies based on the extraction of specific types #
|
||||
#######################################################
|
||||
|
||||
class TopicExtractionStrategy(ExtractionStrategy):
    """
    Split text on the strategy delimiter and tag each piece with its most
    frequent tokens.

    NOTE(review): a TextTilingTokenizer is created in `__init__`, but no method
    in this class uses it; segmentation is a plain split on `self.DEL`.
    """

    def __init__(self, num_keywords: int = 3, **kwargs):
        """
        Initialize the topic extraction strategy.

        Args:
            num_keywords (int): Number of keywords to report for each segment.
            **kwargs: Forwarded to the base class initializer.
        """
        # nltk is imported lazily so the dependency is only needed when this
        # strategy is actually instantiated.
        import nltk
        super().__init__(**kwargs)
        self.num_keywords = num_keywords
        self.tokenizer = nltk.TextTilingTokenizer()

    def extract_keywords(self, text: str) -> List[str]:
        """
        Return the most frequent tokens of a text segment.

        Args:
            text (str): The text segment to analyse.

        Returns:
            List[str]: Up to `num_keywords` tokens, most common first.
        """
        import nltk
        tokens = nltk.word_tokenize(text)
        ranked = nltk.FreqDist(tokens).most_common(self.num_keywords)
        return [token for token, _count in ranked]

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Split the content on `self.DEL` and extract keywords for every segment.

        Args:
            url (str): The URL of the webpage (not used).
            html (str): The content to segment.
            *q: Ignored.
            **kwargs: Ignored.

        Returns:
            List[Dict[str, Any]]: One dict per segment with the keys 'index',
            'content' and 'keywords'.
        """
        return [
            {
                "index": position,
                "content": chunk,
                "keywords": self.extract_keywords(chunk),
            }
            for position, chunk in enumerate(html.split(self.DEL))
        ]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Join the sections with `self.DEL` and run `extract` on the result.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): The sections to process.
            *q: Ignored.
            **kwargs: Forwarded to `extract`.

        Returns:
            List[Dict[str, Any]]: The blocks produced by `extract`.
        """
        joined = self.DEL.join(sections)
        return self.extract(url, joined, **kwargs)
|
||||
|
||||
class ContentSummarizationStrategy(ExtractionStrategy):
    """
    Summarize text sections with a Hugging Face `transformers` summarization
    pipeline.

    `extract` summarizes one section; `run` summarizes many sections on a thread
    pool and returns the results in the original section order.
    """

    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
        """
        Initialize the content summarization strategy with a specific model.

        Args:
            model_name (str): The model to use for summarization.
            **kwargs: Forwarded to the base class initializer.
        """
        super().__init__(**kwargs)
        # Imported lazily so `transformers` is only required when this strategy
        # is instantiated.
        from transformers import pipeline
        self.summarizer = pipeline("summarization", model=model_name)

    def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> Dict[str, Any]:
        """
        Summarize a single section of text.

        Args:
            url (str): The URL of the webpage (not used).
            text (str): The section of text to summarize.
            provider (str): Accepted for signature compatibility; not used.
            api_token (Optional[str]): Accepted for signature compatibility; not used.

        Returns:
            Dict[str, Any]: A dict with a single 'summary' key. If summarization
            raises, the error is printed and the original text is returned as
            the summary.
        """
        try:
            summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
            return {"summary": summary[0]['summary_text']}
        except Exception as e:
            # Deliberate best-effort: fall back to the unsummarized text.
            print(f"Error summarizing text: {e}")
            return {"summary": text}

    def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Summarize every section in parallel.

        Args:
            url (str): The URL of the webpage.
            sections (List[str]): The sections to summarize.
            provider (str): Forwarded to `extract`; not used there.
            api_token (Optional[str]): Forwarded to `extract`; not used there.

        Returns:
            List[Dict[str, Any]]: One 'summary' dict per section, in the same
            order as `sections`. A section whose task fails falls back to its
            original text.
        """
        # Results are written by section index, so completion order is irrelevant.
        summaries: List[Dict[str, Any]] = [None] * len(sections)
        with ThreadPoolExecutor() as executor:
            future_to_section = {
                executor.submit(self.extract, url, section, provider, api_token): i
                for i, section in enumerate(sections)
            }
            for future in as_completed(future_to_section):
                section_index = future_to_section[future]
                try:
                    summaries[section_index] = future.result()
                except Exception as e:
                    print(f"Error processing section {section_index}: {e}")
                    summaries[section_index] = {"summary": sections[section_index]}
        return summaries
|
||||
|
||||
|
||||
#######################################################
|
||||
# New extraction strategies for JSON-based extraction #
|
||||
#######################################################
|
||||
|
||||
class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Abstract base class for extracting structured JSON from HTML content.
|
||||
|
||||
How it works:
|
||||
1. Parses HTML content using the `_parse_html` method.
|
||||
2. Uses a schema to define base selectors, fields, and transformations.
|
||||
3. Extracts data hierarchically, supporting nested fields and lists.
|
||||
4. Handles computed fields with expressions or functions.
|
||||
|
||||
Attributes:
|
||||
DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
|
||||
schema (Dict[str, Any]): The schema defining the extraction rules.
|
||||
verbose (bool): Enables verbose logging for debugging purposes.
|
||||
|
||||
Methods:
|
||||
extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
|
||||
_extract_item(element, fields): Extracts fields from a single element.
|
||||
_extract_single_field(element, field): Extracts a single field based on its type.
|
||||
_apply_transform(value, transform): Applies a transformation to a value.
|
||||
_compute_field(item, field): Computes a field value using an expression or function.
|
||||
run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
|
||||
|
||||
Abstract Methods:
|
||||
_parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
|
||||
_get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
|
||||
_get_elements(element, selector): Retrieves child elements using a selector.
|
||||
_get_element_text(element): Extracts text content from an element.
|
||||
_get_element_html(element): Extracts raw HTML from an element.
|
||||
_get_element_attribute(element, attribute): Extracts an attribute's value from an element.
|
||||
"""
|
||||
|
||||
|
||||
DEL = '\n'
|
||||
|
||||
def __init__(self, schema: Dict[str, Any], **kwargs):
    """
    Set up the strategy with the schema that drives extraction.

    Args:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        **kwargs: Forwarded to the base class initializer. A 'verbose' entry,
            if present, is also stored on the instance.
    """
    super().__init__(**kwargs)
    self.schema = schema
    # Verbose logging is off unless the caller passed it explicitly.
    self.verbose = kwargs['verbose'] if 'verbose' in kwargs else False
|
||||
|
||||
def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Extract structured data from HTML content.
|
||||
|
||||
How it works:
|
||||
1. Parses the HTML content using the `_parse_html` method.
|
||||
2. Identifies base elements using the schema's base selector.
|
||||
3. Extracts fields from each base element using `_extract_item`.
|
||||
|
||||
Args:
|
||||
url (str): The URL of the page being processed.
|
||||
html_content (str): The raw HTML content to parse and extract.
|
||||
*q: Additional positional arguments.
|
||||
**kwargs: Additional keyword arguments for custom extraction.
|
||||
|
||||
Returns:
|
||||
List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
|
||||
"""
|
||||
|
||||
parsed_html = self._parse_html(html_content)
|
||||
base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
|
||||
|
||||
@@ -772,6 +793,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
return field.get('default')
|
||||
|
||||
def _extract_single_field(self, element, field):
|
||||
"""
|
||||
Extract a single field based on its type.
|
||||
|
||||
How it works:
|
||||
1. Selects the target element using the field's selector.
|
||||
2. Extracts the field value based on its type (e.g., text, attribute, regex).
|
||||
3. Applies transformations if defined in the schema.
|
||||
|
||||
Args:
|
||||
element: The base element to extract the field from.
|
||||
field (Dict[str, Any]): The field definition in the schema.
|
||||
|
||||
Returns:
|
||||
Any: The extracted field value.
|
||||
"""
|
||||
|
||||
if 'selector' in field:
|
||||
selected = self._get_elements(element, field['selector'])
|
||||
if not selected:
|
||||
@@ -806,6 +843,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
return item
|
||||
|
||||
def _extract_item(self, element, fields):
|
||||
"""
|
||||
Extracts fields from a given element.
|
||||
|
||||
How it works:
|
||||
1. Iterates through the fields defined in the schema.
|
||||
2. Handles computed, single, and nested field types.
|
||||
3. Updates the item dictionary with extracted field values.
|
||||
|
||||
Args:
|
||||
element: The base element to extract fields from.
|
||||
fields (List[Dict[str, Any]]): The list of fields to extract.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary representing the extracted item.
|
||||
"""
|
||||
|
||||
item = {}
|
||||
for field in fields:
|
||||
if field['type'] == 'computed':
|
||||
@@ -817,6 +870,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
return item
|
||||
|
||||
def _apply_transform(self, value, transform):
|
||||
"""
|
||||
Apply a transformation to a value.
|
||||
|
||||
How it works:
|
||||
1. Checks the transformation type (e.g., `lowercase`, `strip`).
|
||||
2. Applies the transformation to the value.
|
||||
3. Returns the transformed value.
|
||||
|
||||
Args:
|
||||
value (str): The value to transform.
|
||||
transform (str): The type of transformation to apply.
|
||||
|
||||
Returns:
|
||||
str: The transformed value.
|
||||
"""
|
||||
|
||||
if transform == 'lowercase':
|
||||
return value.lower()
|
||||
elif transform == 'uppercase':
|
||||
@@ -837,6 +906,23 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
return field.get('default')
|
||||
|
||||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Extract items from several HTML sections treated as one document.

    The sections are joined with the `DEL` delimiter and the combined string is
    passed to `extract`.

    Args:
        url (str): The URL of the page being processed.
        sections (List[str]): A list of HTML sections.
        *q: Additional positional arguments (not forwarded).
        **kwargs: Additional keyword arguments, forwarded to `extract`.

    Returns:
        List[Dict[str, Any]]: The items returned by `extract`.
    """
    return self.extract(url, self.DEL.join(sections), **kwargs)
|
||||
|
||||
@@ -856,6 +942,27 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
pass
|
||||
|
||||
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
|
||||
|
||||
How it works:
|
||||
1. Parses HTML content with BeautifulSoup.
|
||||
2. Selects elements using CSS selectors defined in the schema.
|
||||
3. Extracts field data and applies transformations as defined.
|
||||
|
||||
Attributes:
|
||||
schema (Dict[str, Any]): The schema defining the extraction rules.
|
||||
verbose (bool): Enables verbose logging for debugging purposes.
|
||||
|
||||
Methods:
|
||||
_parse_html(html_content): Parses HTML content into a BeautifulSoup object.
|
||||
_get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
|
||||
_get_elements(element, selector): Selects child elements using a CSS selector.
|
||||
_get_element_text(element): Extracts text content from a BeautifulSoup element.
|
||||
_get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
|
||||
_get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
|
||||
"""
|
||||
|
||||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||||
kwargs['input_format'] = 'html' # Force HTML input
|
||||
super().__init__(schema, **kwargs)
|
||||
@@ -880,6 +987,28 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
return element.get(attribute)
|
||||
|
||||
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
|
||||
|
||||
How it works:
|
||||
1. Parses HTML content into an lxml tree.
|
||||
2. Selects elements using XPath expressions.
|
||||
3. Converts CSS selectors to XPath when needed.
|
||||
|
||||
Attributes:
|
||||
schema (Dict[str, Any]): The schema defining the extraction rules.
|
||||
verbose (bool): Enables verbose logging for debugging purposes.
|
||||
|
||||
Methods:
|
||||
_parse_html(html_content): Parses HTML content into an lxml tree.
|
||||
_get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
|
||||
_css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
|
||||
_get_elements(element, selector): Selects child elements using an XPath selector.
|
||||
_get_element_text(element): Extracts text content from an lxml element.
|
||||
_get_element_html(element): Extracts the raw HTML content of an lxml element.
|
||||
_get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
|
||||
"""
|
||||
|
||||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||||
kwargs['input_format'] = 'html' # Force HTML input
|
||||
super().__init__(schema, **kwargs)
|
||||
@@ -921,259 +1050,3 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
||||
def _get_element_attribute(self, element, attribute: str):
|
||||
return element.get(attribute)
|
||||
|
||||
|
||||
class _JsonCssExtractionStrategy(ExtractionStrategy):
    """
    Schema-driven extraction of structured items from HTML using CSS selectors.

    How it works:
    1. Parses the HTML with BeautifulSoup (`html.parser`).
    2. Selects the base elements with `schema['baseSelector']`.
    3. For each base element, reads the optional `schema['baseFields']` from the
       element itself, then the `schema['fields']` beneath it.

    Field dictionaries use these keys: `name`, `type`, and depending on the type
    `selector`, `attribute`, `pattern`, `fields`, `transform`, `default`,
    `expression`, `function`. Supported types are `text`, `attribute`, `html`,
    `regex`, `nested`, `list`, `nested_list` and `computed`.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """
        Store the schema and initialise the base strategy for HTML input.

        Args:
            schema (Dict[str, Any]): The extraction schema.
            **kwargs: Forwarded to the base class; `input_format` is overridden.
        """
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract one item per element matched by the schema's base selector.

        Args:
            url (str): The URL of the page (not used by the extraction itself).
            html (str): The HTML content to parse.
            *q: Unused positional arguments.
            **kwargs: Unused keyword arguments.

        Returns:
            List[Dict[str, Any]]: One dictionary per base element.
        """
        soup = BeautifulSoup(html, 'html.parser')
        base_elements = soup.select(self.schema['baseSelector'])

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            if 'baseFields' in self.schema:
                for field in self.schema['baseFields']:
                    value = self._extract_single_field(element, field)
                    if value is not None:
                        item[field['name']] = value

            # Then extract child fields
            field_data = self._extract_item(element, self.schema['fields'])
            item.update(field_data)

            results.append(item)

        return results

    def _extract_field(self, element, field):
        """
        Extract one field, dispatching on structural types first.

        `nested` yields a dict (empty when nothing matches), `list` and
        `nested_list` yield a list of dicts; every other type is handled by
        `_extract_single_field`. Any exception results in the field's `default`.
        """
        try:
            if field['type'] == 'nested':
                nested_element = element.select_one(field['selector'])
                return self._extract_item(nested_element, field['fields']) if nested_element else {}

            if field['type'] == 'list':
                elements = element.select(field['selector'])
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                elements = element.select(field['selector'])
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        """Build a dict of simple (non-structural) fields for one list element."""
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_single_field(self, element, field):
        """
        Extract a `text`, `attribute`, `html` or `regex` value.

        When the field has a `selector`, the first match below `element` is used
        and the field's `default` is returned if nothing matches; without a
        selector the element itself is used.

        Returns:
            The extracted (and optionally transformed) value, or the field's
            `default` when no value could be obtained.
        """
        if 'selector' in field:
            selected = element.select_one(field['selector'])
            if not selected:
                return field.get('default')
        else:
            selected = element

        value = None
        if field['type'] == 'text':
            value = selected.get_text(strip=True)
        elif field['type'] == 'attribute':
            value = selected.get(field['attribute'])
        elif field['type'] == 'html':
            value = str(selected)
        elif field['type'] == 'regex':
            text = selected.get_text(strip=True)
            match = re.search(field['pattern'], text)
            value = match.group(1) if match else None

        # A missing attribute or a regex miss leaves value as None; transforming
        # it would raise AttributeError, so fall through to the default instead.
        if 'transform' in field and value is not None:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        """
        Build a dict for `element` from `fields`, in order.

        `computed` fields are evaluated against the values gathered so far, so
        they can only see fields listed before them.
        """
        item = {}
        for field in fields:
            if field['type'] == 'computed':
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        """Apply `lowercase`, `uppercase` or `strip`; unknown names are a no-op."""
        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
            return value.upper()
        elif transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        """
        Compute a field from already extracted values.

        Uses `field['expression']` (evaluated with `item` as local names) or
        `field['function']` (called with `item`). Returns the field's `default`
        on any exception and None when neither key is present.
        """
        try:
            if 'expression' in field:
                # SECURITY: eval executes arbitrary Python from the schema; only
                # use schemas from trusted sources.
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Join `sections` with `DEL` and run `extract` on the combined HTML."""
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)
|
||||
class _JsonXPathExtractionStrategy(ExtractionStrategy):
    """
    Schema-driven extraction of structured items from HTML using XPath.

    How it works:
    1. Parses the HTML into an lxml tree.
    2. Selects the base elements with the XPath in `schema['baseSelector']`.
    3. For each base element, reads the optional `schema['baseFields']` from the
       element itself, then the `schema['fields']` beneath it.
    4. Field selectors without a `/` are treated as CSS and converted to XPath
       by a very basic converter.

    Field dictionaries use these keys: `name`, `type`, and depending on the type
    `selector`, `attribute`, `pattern`, `fields`, `transform`, `default`,
    `expression`, `function`. Supported types are `text`, `attribute`, `html`,
    `regex`, `nested`, `list`, `nested_list` and `computed`.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """
        Store the schema and initialise the base strategy for HTML input.

        Args:
            schema (Dict[str, Any]): The extraction schema.
            **kwargs: Forwarded to the base class; `input_format` is overridden.
        """
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(**kwargs)
        self.schema = schema

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Extract one item per element matched by the schema's base XPath.

        Args:
            url (str): The URL of the page (not used by the extraction itself).
            html_content (str): The HTML content to parse.
            *q: Unused positional arguments.
            **kwargs: Unused keyword arguments.

        Returns:
            List[Dict[str, Any]]: One dictionary per base element.
        """
        tree = html.fromstring(html_content)
        base_xpath = self.schema['baseSelector']
        base_elements = tree.xpath(base_xpath)

        results = []
        for element in base_elements:
            # Extract base element attributes first
            item = {}
            if 'baseFields' in self.schema:
                for field in self.schema['baseFields']:
                    value = self._extract_single_field(element, field)
                    if value is not None:
                        item[field['name']] = value

            # Then extract child fields
            field_data = self._extract_item(element, self.schema['fields'])
            item.update(field_data)

            results.append(item)

        return results

    def _css_to_xpath(self, css_selector: str) -> str:
        """Convert CSS selector to XPath if needed"""
        if '/' in css_selector:  # Already an XPath
            return css_selector
        # Fallback to basic conversion for common cases
        return self._basic_css_to_xpath(css_selector)

    def _basic_css_to_xpath(self, css_selector: str) -> str:
        """
        Basic CSS to XPath conversion for common cases.

        Handles only child (` > `) and descendant (space) combinators between
        plain names; classes, ids and attribute selectors are not translated.
        """
        # NOTE(review): a leading `//` is evaluated from the document root even
        # when run on a sub-element, so converted selectors are not scoped to
        # the current element - confirm whether `.//` was intended.
        if ' > ' in css_selector:
            parts = css_selector.split(' > ')
            return '//' + '/'.join(parts)
        if ' ' in css_selector:
            parts = css_selector.split(' ')
            return '//' + '//'.join(parts)
        return '//' + css_selector

    def _extract_field(self, element, field):
        """
        Extract one field, dispatching on structural types first.

        `nested` yields a dict (empty when nothing matches), `list` and
        `nested_list` yield a list of dicts; every other type is handled by
        `_extract_single_field`. Any exception results in the field's `default`.
        """
        try:
            if field['type'] == 'nested':
                xpath = self._css_to_xpath(field['selector'])
                # Evaluate the expression once and reuse the result.
                matches = element.xpath(xpath)
                if not matches:
                    return {}
                return self._extract_item(matches[0], field['fields'])

            if field['type'] == 'list':
                xpath = self._css_to_xpath(field['selector'])
                elements = element.xpath(xpath)
                return [self._extract_list_item(el, field['fields']) for el in elements]

            if field['type'] == 'nested_list':
                xpath = self._css_to_xpath(field['selector'])
                elements = element.xpath(xpath)
                return [self._extract_item(el, field['fields']) for el in elements]

            return self._extract_single_field(element, field)
        except Exception as e:
            if self.verbose:
                print(f"Error extracting field {field['name']}: {str(e)}")
            return field.get('default')

    def _extract_list_item(self, element, fields):
        """Build a dict of simple (non-structural) fields for one list element."""
        item = {}
        for field in fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _extract_single_field(self, element, field):
        """
        Extract a `text`, `attribute`, `html` or `regex` value.

        When the field has a `selector`, the first XPath match is used and the
        field's `default` is returned if nothing matches; without a selector the
        element itself is used.

        Returns:
            The extracted (and optionally transformed) value, or the field's
            `default` when no value could be obtained.
        """
        if 'selector' in field:
            xpath = self._css_to_xpath(field['selector'])
            selected = element.xpath(xpath)
            if not selected:
                return field.get('default')
            selected = selected[0]
        else:
            selected = element

        value = None
        if field['type'] == 'text':
            value = ''.join(selected.xpath('.//text()')).strip()
        elif field['type'] == 'attribute':
            value = selected.get(field['attribute'])
        elif field['type'] == 'html':
            value = etree.tostring(selected, encoding='unicode')
        elif field['type'] == 'regex':
            text = ''.join(selected.xpath('.//text()')).strip()
            match = re.search(field['pattern'], text)
            value = match.group(1) if match else None

        # A missing attribute or a regex miss leaves value as None; transforming
        # it would raise AttributeError, so fall through to the default instead.
        if 'transform' in field and value is not None:
            value = self._apply_transform(value, field['transform'])

        return value if value is not None else field.get('default')

    def _extract_item(self, element, fields):
        """
        Build a dict for `element` from `fields`, in order.

        `computed` fields are evaluated against the values gathered so far, so
        they can only see fields listed before them.
        """
        item = {}
        for field in fields:
            if field['type'] == 'computed':
                value = self._compute_field(item, field)
            else:
                value = self._extract_field(element, field)
            if value is not None:
                item[field['name']] = value
        return item

    def _apply_transform(self, value, transform):
        """Apply `lowercase`, `uppercase` or `strip`; unknown names are a no-op."""
        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
            return value.upper()
        elif transform == 'strip':
            return value.strip()
        return value

    def _compute_field(self, item, field):
        """
        Compute a field from already extracted values.

        Uses `field['expression']` (evaluated with `item` as local names) or
        `field['function']` (called with `item`). Returns the field's `default`
        on any exception and None when neither key is present.
        """
        try:
            if 'expression' in field:
                # SECURITY: eval executes arbitrary Python from the schema; only
                # use schemas from trusted sources.
                return eval(field['expression'], {}, item)
            elif 'function' in field:
                return field['function'](item)
        except Exception as e:
            if self.verbose:
                print(f"Error computing field {field['name']}: {str(e)}")
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """Join `sections` with `DEL` and run `extract` on the combined HTML."""
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)
|
||||
|
||||
@@ -38,11 +38,44 @@ class MarkdownGenerationStrategy(ABC):
|
||||
pass
|
||||
|
||||
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
"""Default implementation of markdown generation strategy."""
|
||||
"""
|
||||
Default implementation of markdown generation strategy.
|
||||
|
||||
How it works:
|
||||
1. Generate raw markdown from cleaned HTML.
|
||||
2. Convert links to citations.
|
||||
3. Generate fit markdown if content filter is provided.
|
||||
4. Return MarkdownGenerationResult.
|
||||
|
||||
Args:
|
||||
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
|
||||
options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
|
||||
|
||||
Returns:
|
||||
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
|
||||
"""
|
||||
def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
|
||||
super().__init__(content_filter, options)
|
||||
|
||||
def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
|
||||
"""
|
||||
Convert links in markdown to citations.
|
||||
|
||||
How it works:
|
||||
1. Find all links in the markdown.
|
||||
2. Convert links to citations.
|
||||
3. Return converted markdown and references markdown.
|
||||
|
||||
Note:
|
||||
This function uses a regex pattern to find links in markdown.
|
||||
|
||||
Args:
|
||||
markdown (str): Markdown text.
|
||||
base_url (str): Base URL for URL joins.
|
||||
|
||||
Returns:
|
||||
Tuple[str, str]: Converted markdown and references markdown.
|
||||
"""
|
||||
link_map = {}
|
||||
url_cache = {} # Cache for URL joins
|
||||
parts = []
|
||||
@@ -90,7 +123,26 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
citations: bool = True,
|
||||
**kwargs) -> MarkdownGenerationResult:
|
||||
"""Generate markdown with citations from cleaned HTML."""
|
||||
"""
|
||||
Generate markdown with citations from cleaned HTML.
|
||||
|
||||
How it works:
|
||||
1. Generate raw markdown from cleaned HTML.
|
||||
2. Convert links to citations.
|
||||
3. Generate fit markdown if content filter is provided.
|
||||
4. Return MarkdownGenerationResult.
|
||||
|
||||
Args:
|
||||
cleaned_html (str): Cleaned HTML content.
|
||||
base_url (str): Base URL for URL joins.
|
||||
html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
|
||||
options (Optional[Dict[str, Any]]): Additional options for markdown generation.
|
||||
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
|
||||
citations (bool): Whether to generate citations.
|
||||
|
||||
Returns:
|
||||
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
|
||||
"""
|
||||
# Initialize HTML2Text with options
|
||||
h = CustomHTML2Text()
|
||||
if html2text_options:
|
||||
|
||||
@@ -13,13 +13,34 @@ from pathlib import Path
|
||||
class SSLCertificate:
|
||||
"""
|
||||
A class representing an SSL certificate with methods to export in various formats.
|
||||
|
||||
Attributes:
|
||||
cert_info (Dict[str, Any]): The certificate information.
|
||||
|
||||
Methods:
|
||||
from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
|
||||
from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
|
||||
from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data.
|
||||
export_as_pem() -> str: Export the certificate as PEM format.
|
||||
export_as_der() -> bytes: Export the certificate as DER format.
|
||||
export_as_json() -> Dict[str, Any]: Export the certificate as JSON format.
|
||||
export_as_text() -> str: Export the certificate as text format.
|
||||
"""
|
||||
def __init__(self, cert_info: Dict[str, Any]):
|
||||
self._cert_info = self._decode_cert_data(cert_info)
|
||||
|
||||
@staticmethod
|
||||
def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
|
||||
"""Create SSLCertificate instance from a URL."""
|
||||
"""
|
||||
Create SSLCertificate instance from a URL.
|
||||
|
||||
Args:
|
||||
url (str): URL of the website.
|
||||
timeout (int): Timeout for the connection (default: 10).
|
||||
|
||||
Returns:
|
||||
Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
|
||||
"""
|
||||
try:
|
||||
hostname = urlparse(url).netloc
|
||||
if ':' in hostname:
|
||||
@@ -73,7 +94,15 @@ class SSLCertificate:
|
||||
return data
|
||||
|
||||
def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
|
||||
"""Export certificate as JSON."""
|
||||
"""
|
||||
Export certificate as JSON.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the JSON file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[str]: JSON string if successful, None otherwise.
|
||||
"""
|
||||
json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
|
||||
if filepath:
|
||||
Path(filepath).write_text(json_str, encoding='utf-8')
|
||||
@@ -81,7 +110,15 @@ class SSLCertificate:
|
||||
return json_str
|
||||
|
||||
def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
|
||||
"""Export certificate as PEM."""
|
||||
"""
|
||||
Export certificate as PEM.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the PEM file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[str]: PEM string if successful, None otherwise.
|
||||
"""
|
||||
try:
|
||||
x509 = OpenSSL.crypto.load_certificate(
|
||||
OpenSSL.crypto.FILETYPE_ASN1,
|
||||
@@ -100,7 +137,15 @@ class SSLCertificate:
|
||||
return None
|
||||
|
||||
def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
|
||||
"""Export certificate as DER."""
|
||||
"""
|
||||
Export certificate as DER.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the DER file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[bytes]: DER bytes if successful, None otherwise.
|
||||
"""
|
||||
try:
|
||||
der_data = base64.b64decode(self._cert_info['raw_cert'])
|
||||
if filepath:
|
||||
|
||||
@@ -4,6 +4,34 @@ import re
|
||||
|
||||
|
||||
class UserAgentGenerator:
|
||||
"""
|
||||
Generate random user agents with specified constraints.
|
||||
|
||||
Attributes:
|
||||
desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings.
|
||||
mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings.
|
||||
browser_combinations (dict): A dictionary of possible browser combinations and their corresponding user agent strings.
|
||||
rendering_engines (dict): A dictionary of possible rendering engines and their corresponding user agent strings.
|
||||
chrome_versions (list): A list of possible Chrome browser versions.
|
||||
firefox_versions (list): A list of possible Firefox browser versions.
|
||||
edge_versions (list): A list of possible Edge browser versions.
|
||||
safari_versions (list): A list of possible Safari browser versions.
|
||||
ios_versions (list): A list of possible iOS browser versions.
|
||||
android_versions (list): A list of possible Android browser versions.
|
||||
|
||||
Methods:
|
||||
generate_user_agent(
|
||||
platform: Literal["desktop", "mobile"] = "desktop",
|
||||
browser: str = "chrome",
|
||||
rendering_engine: str = "chrome_webkit",
|
||||
chrome_version: Optional[str] = None,
|
||||
firefox_version: Optional[str] = None,
|
||||
edge_version: Optional[str] = None,
|
||||
safari_version: Optional[str] = None,
|
||||
ios_version: Optional[str] = None,
|
||||
android_version: Optional[str] = None
|
||||
): Generates a random user agent string based on the specified parameters.
|
||||
"""
|
||||
def __init__(self):
|
||||
# Previous platform definitions remain the same...
|
||||
self.desktop_platforms = {
|
||||
@@ -105,7 +133,21 @@ class UserAgentGenerator:
|
||||
]
|
||||
|
||||
def get_browser_stack(self, num_browsers: int = 1) -> List[str]:
|
||||
"""Get a valid combination of browser versions"""
|
||||
"""
|
||||
Get a valid combination of browser versions.
|
||||
|
||||
How it works:
|
||||
1. Check if the number of browsers is supported.
|
||||
2. Randomly choose a combination of browsers.
|
||||
3. Iterate through the combination and add browser versions.
|
||||
4. Return the browser stack.
|
||||
|
||||
Args:
|
||||
num_browsers: Number of browser specifications (1-3)
|
||||
|
||||
Returns:
|
||||
List[str]: A list of browser versions.
|
||||
"""
|
||||
if num_browsers not in self.browser_combinations:
|
||||
raise ValueError(f"Unsupported number of browsers: {num_browsers}")
|
||||
|
||||
|
||||
@@ -25,64 +25,91 @@ from functools import wraps
|
||||
class InvalidCSSSelectorError(Exception):
|
||||
pass
|
||||
|
||||
def create_box_message(
    message: str,
    type: str = "info",
    width: int = 120,
    add_newlines: bool = True,
    double_line: bool = False
) -> str:
    """
    Render `message` inside a coloured box drawn with box-drawing characters.

    Args:
        message (str): Text to show; each input line is wrapped separately.
        type (str): "info", "warning", "error" or "success" (case-insensitive);
            unknown values use the "info" style.
        width (int): Box width; text is wrapped to `width - 4` columns.
        add_newlines (bool): Surround the box with a leading and trailing newline.
        double_line (bool): Draw the border with double-line characters.

    Returns:
        str: The formatted box.
    """
    init()

    # (border colour, text colour, prefix symbol) per message type
    palette = {
        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"),
        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
    }
    border_color, text_color, prefix = palette.get(type.lower(), palette["info"])

    # horizontal, vertical, then the four corners
    if double_line:
        h_line, v_line, tl, tr, bl, br = ("═", "║", "╔", "╗", "╚", "╝")
    else:
        h_line, v_line, tl, tr, bl, br = ("─", "│", "┌", "┐", "└", "┘")

    wrap_width = width - 4
    head, *tail = message.split('\n')

    # The first line carries the type symbol; later lines get a leading space.
    body_lines = textwrap.fill(f"{prefix} {head.strip()}", width=wrap_width).split('\n')
    for raw in tail:
        stripped = raw.strip()
        if not stripped:
            body_lines.append("")  # keep blank lines as empty rows
            continue
        body_lines.extend(textwrap.fill(f" {stripped}", width=wrap_width).split('\n'))

    rule = h_line * (width - 1)
    pad = width - 2
    rows = [f"{border_color}{tl}{rule}{tr}"]
    rows += [
        f"{border_color}{v_line}{text_color} {text:<{pad}}{border_color}{v_line}"
        for text in body_lines
    ]
    rows.append(f"{border_color}{bl}{rule}{br}{Style.RESET_ALL}")

    boxed = "\n".join(rows)
    return f"\n{boxed}\n" if add_newlines else boxed
|
||||
def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str:
    """
    Create a styled message box with colored borders and formatted text.

    How it works:
    1. Picks border colour, text colour and prefix symbol from the message type.
    2. Wraps every input line to `width - 4` columns.
    3. Draws the box with single- or double-line characters.
    4. Optionally adds a newline before and after the box.

    Args:
        message (str): The message to display inside the box.
        type (str): Type of the message ("info", "warning", "error", "success",
            case-insensitive). Unknown values use the "info" style. Defaults to "info".
        width (int): Width of the box. Defaults to 120.
        add_newlines (bool): Whether to add newlines before and after the box. Defaults to True.
        double_line (bool): Whether to use double lines for the box border. Defaults to False.

    Returns:
        str: A formatted string containing the styled message box.
    """
    init()

    # (border colour, text colour, prefix symbol) per message type
    palette = {
        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"),
        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
    }
    border_color, text_color, prefix = palette.get(type.lower(), palette["info"])

    # horizontal, vertical, then the four corners
    if double_line:
        h_line, v_line, tl, tr, bl, br = ("═", "║", "╔", "╗", "╚", "╝")
    else:
        h_line, v_line, tl, tr, bl, br = ("─", "│", "┌", "┐", "└", "┘")

    wrap_width = width - 4
    head, *tail = message.split('\n')

    # The first line carries the type symbol; later lines get a leading space.
    body_lines = textwrap.fill(f"{prefix} {head.strip()}", width=wrap_width).split('\n')
    for raw in tail:
        stripped = raw.strip()
        if not stripped:
            body_lines.append("")  # keep blank lines as empty rows
            continue
        body_lines.extend(textwrap.fill(f" {stripped}", width=wrap_width).split('\n'))

    rule = h_line * (width - 1)
    pad = width - 2
    rows = [f"{border_color}{tl}{rule}{tr}"]
    rows += [
        f"{border_color}{v_line}{text_color} {text:<{pad}}{border_color}{v_line}"
        for text in body_lines
    ]
    rows.append(f"{border_color}{bl}{rule}{br}{Style.RESET_ALL}")

    boxed = "\n".join(rows)
    return f"\n{boxed}\n" if add_newlines else boxed
|
||||
|
||||
def calculate_semaphore_count():
|
||||
"""
|
||||
Calculate the optimal semaphore count based on system resources.
|
||||
|
||||
How it works:
|
||||
1. Determines the number of CPU cores and total system memory.
|
||||
2. Sets a base count as half of the available CPU cores.
|
||||
3. Limits the count based on memory, assuming 2GB per semaphore instance.
|
||||
4. Returns the minimum value between CPU and memory-based limits.
|
||||
|
||||
Returns:
|
||||
int: The calculated semaphore count.
|
||||
"""
|
||||
|
||||
cpu_count = os.cpu_count()
|
||||
memory_gb = get_system_memory() / (1024 ** 3) # Convert to GB
|
||||
base_count = max(1, cpu_count // 2)
|
||||
@@ -90,6 +117,21 @@ def calculate_semaphore_count():
|
||||
return min(base_count, memory_based_cap)
|
||||
|
||||
def get_system_memory():
|
||||
"""
|
||||
Get the total system memory in bytes.
|
||||
|
||||
How it works:
|
||||
1. Detects the operating system.
|
||||
2. Reads memory information from system-specific commands or files.
|
||||
3. Converts the memory to bytes for uniformity.
|
||||
|
||||
Returns:
|
||||
int: The total system memory in bytes.
|
||||
|
||||
Raises:
|
||||
OSError: If the operating system is unsupported.
|
||||
"""
|
||||
|
||||
system = platform.system()
|
||||
if system == "Linux":
|
||||
with open('/proc/meminfo', 'r') as mem:
|
||||
@@ -124,6 +166,18 @@ def get_system_memory():
|
||||
raise OSError("Unsupported operating system")
|
||||
|
||||
def get_home_folder():
|
||||
"""
|
||||
Get or create the home folder for Crawl4AI configuration and cache.
|
||||
|
||||
How it works:
|
||||
1. Uses environment variables or defaults to the user's home directory.
|
||||
2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist.
|
||||
3. Returns the path to the home folder.
|
||||
|
||||
Returns:
|
||||
str: The path to the Crawl4AI home folder.
|
||||
"""
|
||||
|
||||
home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai")
|
||||
os.makedirs(home_folder, exist_ok=True)
|
||||
os.makedirs(f"{home_folder}/cache", exist_ok=True)
|
||||
@@ -194,6 +248,20 @@ def split_and_parse_json_objects(json_string):
|
||||
return parsed_objects, unparsed_segments
|
||||
|
||||
def sanitize_html(html):
|
||||
"""
|
||||
Sanitize an HTML string by escaping quotes.
|
||||
|
||||
How it works:
|
||||
1. Starts from the input HTML unchanged (the character-stripping regex is currently commented out).
|
||||
2. Escapes double and single quotes for safe usage.
|
||||
|
||||
Args:
|
||||
html (str): The HTML string to sanitize.
|
||||
|
||||
Returns:
|
||||
str: The sanitized HTML string.
|
||||
"""
|
||||
|
||||
# Replace all unwanted and special characters with an empty string
|
||||
sanitized_html = html
|
||||
# sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
|
||||
@@ -248,6 +316,23 @@ def escape_json_string(s):
|
||||
return s
|
||||
|
||||
def replace_inline_tags(soup, tags, only_text=False):
|
||||
"""
|
||||
Replace inline HTML tags with Markdown-style equivalents.
|
||||
|
||||
How it works:
|
||||
1. Maps specific tags (e.g., <b>, <i>) to Markdown syntax.
|
||||
2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object.
|
||||
3. Optionally replaces tags with their text content only.
|
||||
|
||||
Args:
|
||||
soup (BeautifulSoup): Parsed HTML content.
|
||||
tags (List[str]): List of tags to replace.
|
||||
only_text (bool): Whether to replace tags with plain text. Defaults to False.
|
||||
|
||||
Returns:
|
||||
BeautifulSoup: Updated BeautifulSoup object with replaced tags.
|
||||
"""
|
||||
|
||||
tag_replacements = {
|
||||
'b': lambda tag: f"**{tag.text}**",
|
||||
'i': lambda tag: f"*{tag.text}*",
|
||||
@@ -292,6 +377,26 @@ def replace_inline_tags(soup, tags, only_text=False):
|
||||
# return soup
|
||||
|
||||
def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
|
||||
"""
|
||||
Extract structured content, media, and links from website HTML.
|
||||
|
||||
How it works:
|
||||
1. Parses the HTML content using BeautifulSoup.
|
||||
2. Extracts internal/external links and media (images, videos, audios).
|
||||
3. Cleans the content by removing unwanted tags and attributes.
|
||||
4. Converts cleaned HTML to Markdown.
|
||||
5. Collects metadata and returns the extracted information.
|
||||
|
||||
Args:
|
||||
url (str): The website URL.
|
||||
html (str): The HTML content of the website.
|
||||
word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
|
||||
css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata.
|
||||
"""
|
||||
|
||||
try:
|
||||
if not html:
|
||||
return None
|
||||
@@ -762,6 +867,27 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
|
||||
}
|
||||
|
||||
def extract_metadata(html, soup=None):
|
||||
"""
|
||||
Extract metadata from an HTML document.

Args:
html (str): The HTML content of the page.
soup (optional): An already-parsed document; defaults to None. Presumably a BeautifulSoup object (verify against callers).

Returns:
dict: The metadata gathered from the document.
|
||||
"""
|
||||
|
||||
metadata = {}
|
||||
|
||||
if not html and not soup:
|
||||
@@ -809,10 +935,35 @@ def extract_metadata(html, soup=None):
|
||||
return metadata
|
||||
|
||||
def extract_xml_tags(string):
    """
    Extract the distinct tag names that appear as simple opening tags in a string.

    Only tags of the exact form `<name>` are matched (word characters only, no
    attributes, no closing slash).

    Args:
        string (str): The input string containing XML tags.

    Returns:
        List[str]: The unique tag names, in order of first appearance.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is deterministic; list(set(...)) order varies with string hash seeding.
    return list(dict.fromkeys(re.findall(r'<(\w+)>', string)))
|
||||
|
||||
def extract_xml_data(tags, string):
|
||||
"""
|
||||
Extract data for specified XML tags from a string.
|
||||
|
||||
How it works:
|
||||
1. Searches the string for each tag using regex.
|
||||
2. Extracts the content within the tags.
|
||||
3. Returns a dictionary of tag-content pairs.
|
||||
|
||||
Args:
|
||||
tags (List[str]): The list of XML tags to extract.
|
||||
string (str): The input string containing XML data.
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: A dictionary with tag names as keys and extracted content as values.
|
||||
"""
|
||||
|
||||
data = {}
|
||||
|
||||
for tag in tags:
|
||||
@@ -833,6 +984,26 @@ def perform_completion_with_backoff(
|
||||
base_url=None,
|
||||
**kwargs
|
||||
):
|
||||
"""
|
||||
Perform an API completion request with exponential backoff.
|
||||
|
||||
How it works:
|
||||
1. Sends a completion request to the API.
|
||||
2. Retries on rate-limit errors with exponential delays.
|
||||
3. Returns the API response or an error after all retries.
|
||||
|
||||
Args:
|
||||
provider (str): The name of the API provider.
|
||||
prompt_with_variables (str): The input prompt for the completion request.
|
||||
api_token (str): The API token for authentication.
|
||||
json_response (bool): Whether to request a JSON response. Defaults to False.
|
||||
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
||||
**kwargs: Additional arguments for the API request.
|
||||
|
||||
Returns:
|
||||
dict: The API response or an error message after all retries.
|
||||
"""
|
||||
|
||||
from litellm import completion
|
||||
from litellm.exceptions import RateLimitError
|
||||
max_attempts = 3
|
||||
@@ -878,6 +1049,25 @@ def perform_completion_with_backoff(
|
||||
}]
|
||||
|
||||
def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
|
||||
"""
|
||||
Extract content blocks from website HTML using an AI provider.
|
||||
|
||||
How it works:
|
||||
1. Prepares a prompt by sanitizing and escaping HTML.
|
||||
2. Sends the prompt to an AI provider with optional retries.
|
||||
3. Parses the response to extract structured blocks or errors.
|
||||
|
||||
Args:
|
||||
url (str): The website URL.
|
||||
html (str): The HTML content of the website.
|
||||
provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER.
|
||||
api_token (Optional[str]): The API token for authentication. Defaults to None.
|
||||
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[dict]: A list of extracted content blocks.
|
||||
"""
|
||||
|
||||
# api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
||||
api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
|
||||
|
||||
@@ -914,6 +1104,23 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, bas
|
||||
return blocks
|
||||
|
||||
def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
|
||||
"""
|
||||
Extract content blocks from a batch of website HTMLs.
|
||||
|
||||
How it works:
|
||||
1. Prepares prompts for each URL and HTML pair.
|
||||
2. Sends the prompts to the AI provider in a batch request.
|
||||
3. Parses the responses to extract structured blocks or errors.
|
||||
|
||||
Args:
|
||||
batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs.
|
||||
provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192".
|
||||
api_token (Optional[str]): The API token for authentication. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[dict]: A list of extracted content blocks from all batch items.
|
||||
"""
|
||||
|
||||
api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
|
||||
from litellm import batch_completion
|
||||
messages = []
|
||||
@@ -986,6 +1193,25 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):
|
||||
return merged_sections
|
||||
|
||||
def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
|
||||
"""
|
||||
Process sections of HTML content sequentially or in parallel.
|
||||
|
||||
How it works:
|
||||
1. Sequentially processes sections with delays for "groq/" providers.
|
||||
2. Uses ThreadPoolExecutor for parallel processing with other providers.
|
||||
3. Extracts content blocks for each section.
|
||||
|
||||
Args:
|
||||
url (str): The website URL.
|
||||
sections (List[str]): The list of HTML sections to process.
|
||||
provider (str): The AI provider for content extraction.
|
||||
api_token (str): The API token for authentication.
|
||||
base_url (Optional[str]): The base URL for the API. Defaults to None.
|
||||
|
||||
Returns:
|
||||
List[dict]: The list of extracted content blocks from all sections.
|
||||
"""
|
||||
|
||||
extracted_content = []
|
||||
if provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
@@ -1002,6 +1228,24 @@ def process_sections(url: str, sections: list, provider: str, api_token: str, ba
|
||||
return extracted_content
|
||||
|
||||
def wrap_text(draw, text, font, max_width):
|
||||
"""
|
||||
Wrap text to fit within a specified width for rendering.
|
||||
|
||||
How it works:
|
||||
1. Splits the text into words.
|
||||
2. Constructs lines that fit within the maximum width using the provided font.
|
||||
3. Returns the wrapped text as a single string.
|
||||
|
||||
Args:
|
||||
draw (ImageDraw.Draw): The drawing context for measuring text size.
|
||||
text (str): The text to wrap.
|
||||
font (ImageFont.FreeTypeFont): The font to use for measuring text size.
|
||||
max_width (int): The maximum width for each line.
|
||||
|
||||
Returns:
|
||||
str: The wrapped text.
|
||||
"""
|
||||
|
||||
# Wrap the text to fit within the specified width
|
||||
lines = []
|
||||
words = text.split()
|
||||
@@ -1013,6 +1257,21 @@ def wrap_text(draw, text, font, max_width):
|
||||
return '\n'.join(lines)
|
||||
|
||||
def format_html(html_string):
    """
    Prettify an HTML string using BeautifulSoup.

    How it works:
    1. Parses the HTML string with BeautifulSoup using the lxml parser.
    2. Formats the parsed tree with one tag per line and nested indentation.
    3. Returns the prettified HTML string.

    Args:
        html_string (str): The HTML string to format.

    Returns:
        str: The prettified HTML string.
    """
    # 'lxml.parser' is not a parser name BeautifulSoup recognises (it raises
    # FeatureNotFound); the lxml HTML parser is selected with 'lxml'.
    soup = BeautifulSoup(html_string, 'lxml')
    return soup.prettify()
|
||||
|
||||
@@ -1110,7 +1369,20 @@ def normalize_url_tmp(href, base_url):
|
||||
return href.strip()
|
||||
|
||||
def get_base_domain(url: str) -> str:
|
||||
"""Extract base domain from URL, handling various edge cases."""
|
||||
"""
|
||||
Extract the base domain from a given URL, handling common edge cases.
|
||||
|
||||
How it works:
|
||||
1. Parses the URL to extract the domain.
|
||||
2. Removes the port number and 'www' prefix.
|
||||
3. Handles special domains (e.g., 'co.uk') to extract the correct base.
|
||||
|
||||
Args:
|
||||
url (str): The URL to extract the base domain from.
|
||||
|
||||
Returns:
|
||||
str: The extracted base domain or an empty string if parsing fails.
|
||||
"""
|
||||
try:
|
||||
# Get domain from URL
|
||||
domain = urlparse(url).netloc.lower()
|
||||
@@ -1136,7 +1408,20 @@ def get_base_domain(url: str) -> str:
|
||||
return ""
|
||||
|
||||
def is_external_url(url: str, base_domain: str) -> bool:
|
||||
"""Check if URL is external to base domain."""
|
||||
"""
|
||||
Check whether a URL is external to the given base domain.
|
||||
|
||||
How it works:
|
||||
1. Treats URLs with special schemes (mailto:, tel:, ftp:, file:, data:, javascript:) as external.
|
||||
2. Otherwise compares the URL against the base domain.
|
||||
3. Returns False if the check raises an exception.
|
||||
|
||||
Args:
|
||||
url (str): The URL to check.
|
||||
base_domain (str): The base domain to compare the URL against.
|
||||
|
||||
Returns:
|
||||
bool: True if the URL is external to the base domain, False otherwise.
|
||||
"""
|
||||
special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
|
||||
if any(url.lower().startswith(p) for p in special):
|
||||
return True
|
||||
@@ -1155,8 +1440,22 @@ def is_external_url(url: str, base_domain: str) -> bool:
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def clean_tokens(tokens: list[str]) -> list[str]:
|
||||
"""
|
||||
Clean a list of tokens by removing noise, stop words, and short tokens.
|
||||
|
||||
How it works:
|
||||
1. Defines a set of noise words and stop words.
|
||||
2. Filters tokens based on length and exclusion criteria.
|
||||
3. Excludes tokens starting with certain symbols (e.g., "↑", "▲").
|
||||
|
||||
Args:
|
||||
tokens (list[str]): The list of tokens to clean.
|
||||
|
||||
Returns:
|
||||
list[str]: The cleaned list of tokens.
|
||||
"""
|
||||
|
||||
# Set of tokens to remove
|
||||
noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'}
|
||||
|
||||
@@ -1212,6 +1511,21 @@ def clean_tokens(tokens: list[str]) -> list[str]:
|
||||
and not token.startswith('⬆')]
|
||||
|
||||
def profile_and_time(func):
|
||||
"""
|
||||
Decorator to profile a function's execution time and performance.
|
||||
|
||||
How it works:
|
||||
1. Records the start time before executing the function.
|
||||
2. Profiles the function's execution using `cProfile`.
|
||||
3. Prints the elapsed time and profiling statistics.
|
||||
|
||||
Args:
|
||||
func (Callable): The function to decorate.
|
||||
|
||||
Returns:
|
||||
Callable: The decorated function with profiling and timing enabled.
|
||||
"""
|
||||
|
||||
@wraps(func)
|
||||
def wrapper(self, *args, **kwargs):
|
||||
# Start timer
|
||||
|
||||
114
docs/examples/amazon_product_extraction_direct_url.py
Normal file
114
docs/examples/amazon_product_extraction_direct_url.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
This example demonstrates how to use JSON CSS extraction to scrape product information
|
||||
from Amazon search results. It shows how to extract structured data like product titles,
|
||||
prices, ratings, and other details using CSS selectors.
|
||||
"""
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
import json
|
||||
|
||||
async def extract_amazon_products():
    """Crawl an Amazon search-results URL and print the extracted products.

    A JSON-CSS schema describes which fields to pull from every search-result
    card; the crawler runs in a headless Chromium browser and the extracted
    JSON is printed one product at a time.
    """
    # Headless Chromium browser settings
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    # One entry per value extracted from each search-result card
    product_fields = [
        {"name": "asin", "selector": "", "type": "attribute", "attribute": "data-asin"},
        {"name": "title", "selector": "h2 a span", "type": "text"},
        {"name": "url", "selector": "h2 a", "type": "attribute", "attribute": "href"},
        {"name": "image", "selector": ".s-image", "type": "attribute", "attribute": "src"},
        {"name": "rating", "selector": ".a-icon-star-small .a-icon-alt", "type": "text"},
        {
            "name": "reviews_count",
            "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
            "type": "text",
        },
        {"name": "price", "selector": ".a-price .a-offscreen", "type": "text"},
        {
            "name": "original_price",
            "selector": ".a-price.a-text-price .a-offscreen",
            "type": "text",
        },
        {"name": "sponsored", "selector": ".puis-sponsored-label-text", "type": "exists"},
        {
            "name": "delivery_info",
            "selector": "[data-cy='delivery-recipe'] .a-color-base",
            "type": "text",
            "multiple": True,
        },
    ]
    schema = {
        "name": "Amazon Product Search Results",
        "baseSelector": "[data-component-type='s-search-result']",
        "fields": product_fields,
    }

    # Run configuration carrying the JSON-CSS extraction strategy
    crawler_config = CrawlerRunConfig(
        extraction_strategy=JsonCssExtractionStrategy(schema=schema)
    )

    # Example search URL (replace with the Amazon URL you actually want)
    url = "https://www.amazon.com/s?k=Samsung+Galaxy+Tab"

    # The context manager takes care of starting and closing the crawler
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)

        # Nothing to report when the crawl produced no extracted content
        if not (result and result.extracted_content):
            return

        # extracted_content is a JSON string holding a list of products
        products = json.loads(result.extracted_content)
        for product in products:
            print("\nProduct Details:")
            print(f"ASIN: {product.get('asin')}")
            print(f"Title: {product.get('title')}")
            print(f"Price: {product.get('price')}")
            print(f"Original Price: {product.get('original_price')}")
            print(f"Rating: {product.get('rating')}")
            print(f"Reviews: {product.get('reviews_count')}")
            print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
            if product.get('delivery_info'):
                print(f"Delivery: {' '.join(product['delivery_info'])}")
            print("-" * 80)


if __name__ == "__main__":
    import asyncio

    asyncio.run(extract_amazon_products())
|
||||
145
docs/examples/amazon_product_extraction_using_hooks.py
Normal file
145
docs/examples/amazon_product_extraction_using_hooks.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""
|
||||
This example demonstrates how to use JSON CSS extraction to scrape product information
|
||||
from Amazon search results. It shows how to extract structured data like product titles,
|
||||
prices, ratings, and other details using CSS selectors.
|
||||
"""
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
import json
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
async def extract_amazon_products():
    """Search Amazon through an ``after_goto`` hook and print the products.

    The crawler opens the Amazon home page; the hook then fills the search
    box, clicks the search button and waits for result cards to appear. The
    result cards are extracted with a JSON-CSS schema and printed.
    """
    # Headless browser settings
    browser_config = BrowserConfig(
        # browser_type="chromium",
        headless=True
    )

    # One entry per value extracted from each search-result card
    product_fields = [
        {"name": "asin", "selector": "", "type": "attribute", "attribute": "data-asin"},
        {"name": "title", "selector": "h2 a span", "type": "text"},
        {"name": "url", "selector": "h2 a", "type": "attribute", "attribute": "href"},
        {"name": "image", "selector": ".s-image", "type": "attribute", "attribute": "src"},
        {"name": "rating", "selector": ".a-icon-star-small .a-icon-alt", "type": "text"},
        {
            "name": "reviews_count",
            "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
            "type": "text",
        },
        {"name": "price", "selector": ".a-price .a-offscreen", "type": "text"},
        {
            "name": "original_price",
            "selector": ".a-price.a-text-price .a-offscreen",
            "type": "text",
        },
        {"name": "sponsored", "selector": ".puis-sponsored-label-text", "type": "exists"},
        {
            "name": "delivery_info",
            "selector": "[data-cy='delivery-recipe'] .a-color-base",
            "type": "text",
            "multiple": True,
        },
    ]
    schema = {
        "name": "Amazon Product Search Results",
        "baseSelector": "[data-component-type='s-search-result']",
        "fields": product_fields,
    }

    # Bypass the cache and attach the JSON-CSS extraction strategy
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema=schema),
    )

    url = "https://www.amazon.com/"

    async def after_goto(page: Page, context: BrowserContext, url: str, response: dict, **kwargs):
        """Hook run after navigation: perform the product search on the page."""
        print(f"[HOOK] after_goto - Successfully loaded: {url}")

        try:
            # Locate the search box and enter the query
            search_box = await page.wait_for_selector('#twotabsearchtextbox', timeout=1000)
            await search_box.fill('Samsung Galaxy Tab')

            # Locate the submit button and trigger the search
            search_button = await page.wait_for_selector('#nav-search-submit-button', timeout=1000)
            await search_button.click()

            # Block until at least one result card is present
            await page.wait_for_selector('[data-component-type="s-search-result"]', timeout=10000)
            print("[HOOK] Search completed and results loaded!")

        except Exception as e:
            # Best effort: report the failure and let the crawl continue
            print(f"[HOOK] Error during search operation: {str(e)}")

        return page

    # The context manager takes care of starting and closing the crawler
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler.crawler_strategy.set_hook("after_goto", after_goto)

        result = await crawler.arun(url=url, config=crawler_config)

        # Nothing to report when the crawl produced no extracted content
        if not (result and result.extracted_content):
            return

        # extracted_content is a JSON string holding a list of products
        products = json.loads(result.extracted_content)
        for product in products:
            print("\nProduct Details:")
            print(f"ASIN: {product.get('asin')}")
            print(f"Title: {product.get('title')}")
            print(f"Price: {product.get('price')}")
            print(f"Original Price: {product.get('original_price')}")
            print(f"Rating: {product.get('rating')}")
            print(f"Reviews: {product.get('reviews_count')}")
            print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
            if product.get('delivery_info'):
                print(f"Delivery: {' '.join(product['delivery_info'])}")
            print("-" * 80)


if __name__ == "__main__":
    import asyncio

    asyncio.run(extract_amazon_products())
|
||||
129
docs/examples/amazon_product_extraction_using_use_javascript.py
Normal file
129
docs/examples/amazon_product_extraction_using_use_javascript.py
Normal file
@@ -0,0 +1,129 @@
|
||||
"""
|
||||
This example demonstrates how to use JSON CSS extraction to scrape product information
|
||||
from Amazon search results. It shows how to extract structured data like product titles,
|
||||
prices, ratings, and other details using CSS selectors.
|
||||
"""
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
import json
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
async def extract_amazon_products():
    """Search Amazon with injected JavaScript and print the products.

    The crawler opens the Amazon home page, runs a JavaScript snippet that
    fills the search box and clicks the search button, waits for result cards
    to appear, then extracts them with a JSON-CSS schema and prints them.
    """
    # Headless browser settings
    browser_config = BrowserConfig(
        # browser_type="chromium",
        headless=True
    )

    # Async flavour of the search script; this is the one passed to the crawler
    js_code_to_search = """
        const task = async () => {
            document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
            document.querySelector('#nav-search-submit-button').click();
        }
        await task();
    """
    # Synchronous flavour of the same script; defined for reference, not used below
    js_code_to_search_sync = """
        document.querySelector('#twotabsearchtextbox').value = 'Samsung Galaxy Tab';
        document.querySelector('#nav-search-submit-button').click();
    """

    # One entry per value extracted from each search-result card
    product_fields = [
        {"name": "asin", "selector": "", "type": "attribute", "attribute": "data-asin"},
        {"name": "title", "selector": "h2 a span", "type": "text"},
        {"name": "url", "selector": "h2 a", "type": "attribute", "attribute": "href"},
        {"name": "image", "selector": ".s-image", "type": "attribute", "attribute": "src"},
        {"name": "rating", "selector": ".a-icon-star-small .a-icon-alt", "type": "text"},
        {
            "name": "reviews_count",
            "selector": "[data-csa-c-func-deps='aui-da-a-popover'] ~ span span",
            "type": "text",
        },
        {"name": "price", "selector": ".a-price .a-offscreen", "type": "text"},
        {
            "name": "original_price",
            "selector": ".a-price.a-text-price .a-offscreen",
            "type": "text",
        },
        {"name": "sponsored", "selector": ".puis-sponsored-label-text", "type": "exists"},
        {
            "name": "delivery_info",
            "selector": "[data-cy='delivery-recipe'] .a-color-base",
            "type": "text",
            "multiple": True,
        },
    ]
    schema = {
        "name": "Amazon Product Search Results",
        "baseSelector": "[data-component-type='s-search-result']",
        "fields": product_fields,
    }

    # Bypass the cache, run the search script, and wait for result cards
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code=js_code_to_search,
        wait_for='css:[data-component-type="s-search-result"]',
        extraction_strategy=JsonCssExtractionStrategy(schema=schema),
    )

    # Start from the home page; the injected script performs the search
    url = "https://www.amazon.com/"

    # The context manager takes care of starting and closing the crawler
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)

        # Nothing to report when the crawl produced no extracted content
        if not (result and result.extracted_content):
            return

        # extracted_content is a JSON string holding a list of products
        products = json.loads(result.extracted_content)
        for product in products:
            print("\nProduct Details:")
            print(f"ASIN: {product.get('asin')}")
            print(f"Title: {product.get('title')}")
            print(f"Price: {product.get('price')}")
            print(f"Original Price: {product.get('original_price')}")
            print(f"Rating: {product.get('rating')}")
            print(f"Reviews: {product.get('reviews_count')}")
            print(f"Sponsored: {'Yes' if product.get('sponsored') else 'No'}")
            if product.get('delivery_info'):
                print(f"Delivery: {' '.join(product['delivery_info'])}")
            print("-" * 80)


if __name__ == "__main__":
    import asyncio

    asyncio.run(extract_amazon_products())
|
||||
@@ -1,6 +1,8 @@
|
||||
import os, sys
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||
os.environ['FIRECRAWL_API_KEY'] = "fc-84b370ccfad44beabc686b38f1769692"
|
||||
|
||||
sys.path.append(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
)
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
@@ -12,7 +14,10 @@ from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
|
||||
from crawl4ai.extraction_strategy import (
|
||||
JsonCssExtractionStrategy,
|
||||
LLMExtractionStrategy,
|
||||
)
|
||||
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
|
||||
@@ -21,128 +26,182 @@ print("GitHub Repository: https://github.com/unclecode/crawl4ai")
|
||||
print("Twitter: @unclecode")
|
||||
print("Website: https://crawl4ai.com")
|
||||
|
||||
|
||||
# Basic Example - Simple Crawl
|
||||
async def simple_crawl():
|
||||
print("\n--- Basic Usage ---")
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
config=crawler_config
|
||||
url="https://www.nbcnews.com/business", config=crawler_config
|
||||
)
|
||||
print(result.markdown[:500])
|
||||
|
||||
|
||||
async def clean_content():
    """Crawl a Wikipedia page and compare raw vs. filtered markdown sizes.

    The run excludes nav/footer/aside tags, removes overlay elements and
    generates markdown through a pruning content filter; the lengths of the
    raw and the "fit" markdown are then printed.
    """
    # Content filter handed to the markdown generator
    pruning_filter = PruningContentFilter(
        threshold=0.48, threshold_type="fixed", min_word_threshold=0
    )
    markdown_generator = DefaultMarkdownGenerator(
        content_filter=pruning_filter,
        options={"ignore_links": True},
    )
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        excluded_tags=["nav", "footer", "aside"],
        remove_overlay_elements=True,
        markdown_generator=markdown_generator,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            config=crawler_config,
        )
        markdown = result.markdown_v2
        full_markdown_length = len(markdown.raw_markdown)
        fit_markdown_length = len(markdown.fit_markdown)
        print(f"Full Markdown Length: {full_markdown_length}")
        print(f"Fit Markdown Length: {fit_markdown_length}")
|
||||
|
||||
async def link_analysis():
    """Crawl a news page and report the links found on it.

    External and social-media links are excluded by the run configuration.
    Prints the internal and external link counts, then the href and text of
    the first five internal links.
    """
    link_options = {
        "cache_mode": CacheMode.ENABLED,
        "exclude_external_links": True,
        "exclude_social_media_links": True,
    }
    crawler_config = CrawlerRunConfig(**link_options)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=crawler_config,
        )
        print(f"Found {len(result.links['internal'])} internal links")
        print(f"Found {len(result.links['external'])} external links")

        # Show only a short preview of the internal links
        preview = result.links['internal'][:5]
        for link in preview:
            print(f"Href: {link['href']}\nText: {link['text']}\n")
|
||||
|
||||
# JavaScript Execution Example
|
||||
async def simple_example_with_running_js_code():
|
||||
print("\n--- Executing JavaScript and Using CSS Selectors ---")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
java_script_enabled=True
|
||||
)
|
||||
|
||||
|
||||
browser_config = BrowserConfig(headless=True, java_script_enabled=True)
|
||||
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
js_code=["const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"],
|
||||
js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
|
||||
# wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
|
||||
)
|
||||
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
config=crawler_config
|
||||
url="https://www.nbcnews.com/business", config=crawler_config
|
||||
)
|
||||
print(result.markdown[:500])
|
||||
|
||||
|
||||
# CSS Selector Example
|
||||
async def simple_example_with_css_selector():
|
||||
print("\n--- Using CSS Selectors ---")
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
css_selector=".wide-tease-item__description"
|
||||
cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
|
||||
)
|
||||
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business", config=crawler_config
|
||||
)
|
||||
print(result.markdown[:500])
|
||||
|
||||
async def media_handling():
    """Crawl a news page and print details of the first images found.

    The run excludes external images and requests a screenshot. Prints the
    first 500 characters of the markdown, then the source, alt text and score
    of up to five images.
    """
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        exclude_external_images=True,
        screenshot=True,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=crawler_config
        )
        print(result.markdown[:500])

        # Show only a short preview of the collected images
        first_images = result.media['images'][:5]
        for img in first_images:
            print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
|
||||
|
||||
async def custom_hook_workflow(verbose=True):
    """Crawl a page with a custom 'before_goto' hook installed.

    The hook prints a message just before navigation; after the crawl the
    first 500 characters of the raw markdown are printed on a single line.

    Args:
        verbose (bool): Not used by this example. Defaults to True.
    """

    def announce_navigation(page, context, **kwargs):
        # Accept extra keyword arguments so the hook does not raise a
        # TypeError if the crawler passes more than page and context.
        # NOTE(review): confirm the exact arguments passed to 'before_goto'.
        print("[Hook] Preparing to navigate...")

    async with AsyncWebCrawler() as crawler:
        # Set a 'before_goto' hook to run custom code just before navigation
        crawler.crawler_strategy.set_hook("before_goto", announce_navigation)

        # Perform the crawl operation
        result = await crawler.arun(url="https://crawl4ai.com")
        print(result.markdown_v2.raw_markdown[:500].replace("\n", " -- "))
|
||||
|
||||
|
||||
# Proxy Example
|
||||
async def use_proxy():
|
||||
print("\n--- Using a Proxy ---")
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
proxy="http://your-proxy-url:port"
|
||||
proxy_config={
|
||||
"server": "http://proxy.example.com:8080",
|
||||
"username": "username",
|
||||
"password": "password",
|
||||
},
|
||||
)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
config=crawler_config
|
||||
url="https://www.nbcnews.com/business", config=crawler_config
|
||||
)
|
||||
if result.success:
|
||||
print(result.markdown[:500])
|
||||
|
||||
|
||||
# Screenshot Example
|
||||
async def capture_and_save_screenshot(url: str, output_path: str):
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True
|
||||
)
|
||||
|
||||
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
|
||||
if result.success and result.screenshot:
|
||||
import base64
|
||||
|
||||
screenshot_data = base64.b64decode(result.screenshot)
|
||||
with open(output_path, 'wb') as f:
|
||||
with open(output_path, "wb") as f:
|
||||
f.write(screenshot_data)
|
||||
print(f"Screenshot saved successfully to {output_path}")
|
||||
else:
|
||||
print("Failed to capture screenshot")
|
||||
|
||||
|
||||
# LLM Extraction Example
|
||||
class OpenAIModelFee(BaseModel):
|
||||
model_name: str = Field(..., description="Name of the OpenAI model.")
|
||||
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
|
||||
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
||||
output_fee: str = Field(
|
||||
..., description="Fee for output token for the OpenAI model."
|
||||
)
|
||||
|
||||
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
|
||||
|
||||
async def extract_structured_data_using_llm(
|
||||
provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
|
||||
):
|
||||
print(f"\n--- Extracting Structured Data with {provider} ---")
|
||||
|
||||
|
||||
if api_token is None and provider != "ollama":
|
||||
print(f"API token is required for {provider}. Skipping this example.")
|
||||
return
|
||||
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
extra_args = {
|
||||
"temperature": 0,
|
||||
"top_p": 0.9,
|
||||
"max_tokens": 2000
|
||||
}
|
||||
|
||||
extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
|
||||
if extra_headers:
|
||||
extra_args["extra_headers"] = extra_headers
|
||||
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
word_count_threshold=1,
|
||||
page_timeout = 80000,
|
||||
page_timeout=80000,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider=provider,
|
||||
api_token=api_token,
|
||||
@@ -150,17 +209,17 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
Do not miss any models in the entire content.""",
|
||||
extra_args=extra_args
|
||||
)
|
||||
extra_args=extra_args,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://openai.com/api/pricing/",
|
||||
config=crawler_config
|
||||
url="https://openai.com/api/pricing/", config=crawler_config
|
||||
)
|
||||
print(result.extracted_content)
|
||||
|
||||
|
||||
# CSS Extraction Example
|
||||
async def extract_structured_data_using_css_extractor():
|
||||
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
|
||||
@@ -192,16 +251,13 @@ async def extract_structured_data_using_css_extractor():
|
||||
"name": "course_icon",
|
||||
"selector": ".image-92",
|
||||
"type": "attribute",
|
||||
"attribute": "src"
|
||||
}
|
||||
]
|
||||
"attribute": "src",
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
java_script_enabled=True
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(headless=True, java_script_enabled=True)
|
||||
|
||||
js_click_tabs = """
|
||||
(async () => {
|
||||
const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
|
||||
@@ -212,23 +268,23 @@ async def extract_structured_data_using_css_extractor():
|
||||
}
|
||||
})();
|
||||
"""
|
||||
|
||||
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
extraction_strategy=JsonCssExtractionStrategy(schema),
|
||||
js_code=[js_click_tabs]
|
||||
js_code=[js_click_tabs],
|
||||
)
|
||||
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.kidocode.com/degrees/technology",
|
||||
config=crawler_config
|
||||
url="https://www.kidocode.com/degrees/technology", config=crawler_config
|
||||
)
|
||||
|
||||
companies = json.loads(result.extracted_content)
|
||||
print(f"Successfully extracted {len(companies)} companies")
|
||||
print(json.dumps(companies[0], indent=2))
|
||||
|
||||
|
||||
# Dynamic Content Examples - Method 1
|
||||
async def crawl_dynamic_content_pages_method_1():
|
||||
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
|
||||
@@ -249,10 +305,7 @@ async def crawl_dynamic_content_pages_method_1():
|
||||
except Exception as e:
|
||||
print(f"Warning: New content didn't appear after JavaScript execution: {e}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
java_script_enabled=True
|
||||
)
|
||||
browser_config = BrowserConfig(headless=False, java_script_enabled=True)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
|
||||
@@ -272,7 +325,7 @@ async def crawl_dynamic_content_pages_method_1():
|
||||
css_selector="li.Box-sc-g0xbh4-0",
|
||||
js_code=js_next_page if page > 0 else None,
|
||||
js_only=page > 0,
|
||||
session_id=session_id
|
||||
session_id=session_id,
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
@@ -286,14 +339,12 @@ async def crawl_dynamic_content_pages_method_1():
|
||||
|
||||
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
|
||||
|
||||
# Dynamic Content Examples - Method 2
|
||||
async def crawl_dynamic_content_pages_method_2():
|
||||
print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
java_script_enabled=True
|
||||
)
|
||||
browser_config = BrowserConfig(headless=False, java_script_enabled=True)
|
||||
|
||||
js_next_page_and_wait = """
|
||||
(async () => {
|
||||
@@ -343,7 +394,7 @@ async def crawl_dynamic_content_pages_method_2():
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=js_next_page_and_wait if page > 0 else None,
|
||||
js_only=page > 0,
|
||||
session_id=session_id
|
||||
session_id=session_id,
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
@@ -355,88 +406,128 @@ async def crawl_dynamic_content_pages_method_2():
|
||||
|
||||
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
|
||||
|
||||
async def cosine_similarity_extraction():
    """Crawl an NBC News article with a ``CosineStrategy`` and print the first five extracted items."""
    article_url = "https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156"

    crawl_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=CosineStrategy(
            word_count_threshold=10,
            max_dist=0.2,  # Maximum distance between two words
            linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
            top_k=3,  # Number of top keywords to extract
            sim_threshold=0.3,  # Similarity threshold for clustering
            semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
            verbose=True,
        ),
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=article_url, config=crawl_config)
        # ``extracted_content`` is parsed as JSON and only the first five entries are shown.
        extracted_items = json.loads(result.extracted_content)
        print(extracted_items[:5])
|
||||
|
||||
# Browser Comparison
|
||||
async def crawl_custom_browser_type():
    """Crawl example.com once each with Firefox, WebKit and Chromium and print timings.

    For every engine a fresh headless ``AsyncWebCrawler`` is started, the page
    is fetched with the cache bypassed, and the elapsed wall-clock time
    (measured from before the crawler starts) is printed together with the
    first 500 characters of the resulting markdown.
    """
    print("\n--- Browser Comparison ---")

    # (label printed in the output, value passed as ``browser_type``)
    engines = (
        ("Firefox", "firefox"),
        ("WebKit", "webkit"),
        ("Chromium", "chromium"),  # Chromium (default)
    )

    for label, browser_type in engines:
        browser_config = BrowserConfig(browser_type=browser_type, headless=True)
        start = time.time()
        async with AsyncWebCrawler(config=browser_config) as crawler:
            result = await crawler.arun(
                url="https://www.example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print(f"{label}:", time.time() - start)
            print(result.markdown[:500])
|
||||
|
||||
|
||||
# Anti-Bot and User Simulation
|
||||
async def crawl_with_user_simulation():
    """Crawl a page with a random mobile/Android user agent and user-simulation flags enabled.

    ``YOUR-URL-HERE`` is a placeholder and has to be replaced with a real URL
    before this example can run.
    """
    browser_config = BrowserConfig(
        headless=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
    )

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        simulate_user=True,
        override_navigator=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # A single crawl; every keyword argument is passed exactly once.
        result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
        print(result.markdown)
|
||||
|
||||
async def ssl_certification():
    """Fetch the SSL certificate of example.com, print key fields and export it.

    The certificate is exported as JSON, PEM and DER files into ``tmp_dir``,
    a name that is not defined in this function and must exist in the
    enclosing module. Nothing is printed or written when the crawl fails or
    returns no certificate.
    """
    # Configure crawler to fetch SSL certificate
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)

        if not (result.success and result.ssl_certificate):
            return
        cert = result.ssl_certificate

        # 1. Access certificate properties directly
        print("\nCertificate Information:")
        print(f"Issuer: {cert.issuer.get('CN', '')}")
        print(f"Valid until: {cert.valid_until}")
        print(f"Fingerprint: {cert.fingerprint}")

        # 2. Export certificate in different formats.
        # Each target path is built once and reused for the export and the log line.
        json_path = os.path.join(tmp_dir, "certificate.json")
        pem_path = os.path.join(tmp_dir, "certificate.pem")
        der_path = os.path.join(tmp_dir, "certificate.der")

        cert.to_json(json_path)  # For analysis
        print("\nCertificate exported to:")
        print(f"- JSON: {json_path}")

        cert.to_pem(pem_path)  # For web servers
        print(f"- PEM: {pem_path}")

        cert.to_der(der_path)  # For Java apps
        print(f"- DER: {der_path}")
|
||||
|
||||
# Speed Comparison
|
||||
async def speed_comparison():
|
||||
print("\n--- Speed Comparison ---")
|
||||
|
||||
|
||||
# Firecrawl comparison
|
||||
from firecrawl import FirecrawlApp
|
||||
app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
|
||||
|
||||
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
|
||||
start = time.time()
|
||||
scrape_status = app.scrape_url(
|
||||
'https://www.nbcnews.com/business',
|
||||
params={'formats': ['markdown', 'html']}
|
||||
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
|
||||
)
|
||||
end = time.time()
|
||||
print("Firecrawl:")
|
||||
@@ -447,16 +538,15 @@ async def speed_comparison():
|
||||
|
||||
# Crawl4AI comparisons
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
|
||||
# Simple crawl
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
start = time.time()
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
config=CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
word_count_threshold=0
|
||||
)
|
||||
cache_mode=CacheMode.BYPASS, word_count_threshold=0
|
||||
),
|
||||
)
|
||||
end = time.time()
|
||||
print("Crawl4AI (simple crawl):")
|
||||
@@ -474,12 +564,10 @@ async def speed_comparison():
|
||||
word_count_threshold=0,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48,
|
||||
threshold_type="fixed",
|
||||
min_word_threshold=0
|
||||
threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
)
|
||||
)
|
||||
)
|
||||
),
|
||||
),
|
||||
)
|
||||
end = time.time()
|
||||
print("Crawl4AI (Markdown Plus):")
|
||||
@@ -489,22 +577,25 @@ async def speed_comparison():
|
||||
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
|
||||
print()
|
||||
|
||||
|
||||
# Main execution
|
||||
async def main():
|
||||
# Basic examples
|
||||
# await simple_crawl()
|
||||
# await simple_example_with_running_js_code()
|
||||
# await simple_example_with_css_selector()
|
||||
|
||||
|
||||
# Advanced examples
|
||||
# await extract_structured_data_using_css_extractor()
|
||||
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
||||
await extract_structured_data_using_llm(
|
||||
"openai/gpt-4o", os.getenv("OPENAI_API_KEY")
|
||||
)
|
||||
# await crawl_dynamic_content_pages_method_1()
|
||||
# await crawl_dynamic_content_pages_method_2()
|
||||
|
||||
|
||||
# Browser comparisons
|
||||
# await crawl_custom_browser_type()
|
||||
|
||||
|
||||
# Performance testing
|
||||
# await speed_comparison()
|
||||
|
||||
@@ -514,5 +605,6 @@ async def main():
|
||||
# os.path.join(__location__, "tmp/example_screenshot.jpg")
|
||||
# )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the example suite exactly once when executed as a script.
    asyncio.run(main())
|
||||
|
||||
@@ -627,13 +627,13 @@ async def main():
|
||||
# }
|
||||
# await extract_structured_data_using_llm(extra_headers=custom_headers)
|
||||
|
||||
await crawl_dynamic_content_pages_method_1()
|
||||
await crawl_dynamic_content_pages_method_2()
|
||||
# await crawl_dynamic_content_pages_method_1()
|
||||
# await crawl_dynamic_content_pages_method_2()
|
||||
await crawl_dynamic_content_pages_method_3()
|
||||
|
||||
await crawl_custom_browser_type()
|
||||
# await crawl_custom_browser_type()
|
||||
|
||||
await speed_comparison()
|
||||
# await speed_comparison()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,281 +0,0 @@
|
||||
from openai import AsyncOpenAI
|
||||
from chainlit.types import ThreadDict
|
||||
import chainlit as cl
|
||||
from chainlit.input_widget import Select, Switch, Slider
|
||||
# Shared asynchronous OpenAI client, constructed without explicit arguments.
client = AsyncOpenAI()

# Instrument the OpenAI client
cl.instrument_openai()

# Keyword arguments unpacked into the ``client.chat.completions.create`` calls below.
settings = dict(
    model="gpt-3.5-turbo",
    temperature=0.5,
    max_tokens=500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)
|
||||
|
||||
@cl.action_callback("action_button")
async def on_action(action: cl.Action):
    """Log a click on the ``action_button`` action and return a thank-you string."""
    acknowledgement = "Thank you for clicking on the action button!"
    print("The user clicked on the action button!")
    return acknowledgement
|
||||
|
||||
@cl.set_chat_profiles
async def chat_profile():
    """Return the two selectable chat profiles, GPT-3.5 and GPT-4."""
    # (profile name, icon URL) pairs; the description is derived from the name.
    profile_specs = (
        ("GPT-3.5", "https://picsum.photos/200"),
        ("GPT-4", "https://picsum.photos/250"),
    )
    return [
        cl.ChatProfile(
            name=profile_name,
            markdown_description=f"The underlying LLM model is **{profile_name}**.",
            icon=icon_url,
        )
        for profile_name, icon_url in profile_specs
    ]
|
||||
|
||||
@cl.on_chat_start
async def on_chat_start():
    """Set up a new chat: publish the settings panel, initialise session state, send demo messages.

    The demo messages exercise several Chainlit element types (image, text,
    audio, avatar, file) and end with a message carrying an action named
    ``action_button``.
    """
    # The values returned by ``send()`` are not used here, so the result is not
    # bound to a name (binding it to ``settings`` would shadow the module-level dict).
    await cl.ChatSettings(
        [
            Select(
                id="Model",
                label="OpenAI - Model",
                values=["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4", "gpt-4-32k"],
                initial_index=0,
            ),
            Switch(id="Streaming", label="OpenAI - Stream Tokens", initial=True),
            Slider(
                id="Temperature",
                label="OpenAI - Temperature",
                initial=1,
                min=0,
                max=2,
                step=0.1,
            ),
            Slider(
                id="SAI_Steps",
                label="Stability AI - Steps",
                initial=30,
                min=10,
                max=150,
                step=1,
                description="Amount of inference steps performed on image generation.",
            ),
            Slider(
                id="SAI_Cfg_Scale",
                label="Stability AI - Cfg_Scale",
                initial=7,
                min=1,
                max=35,
                step=0.1,
                description="Influences how strongly your generation is guided to match your prompt.",
            ),
            Slider(
                id="SAI_Width",
                label="Stability AI - Image Width",
                initial=512,
                min=256,
                max=2048,
                step=64,
                tooltip="Measured in pixels",
            ),
            Slider(
                id="SAI_Height",
                label="Stability AI - Image Height",
                initial=512,
                min=256,
                max=2048,
                step=64,
                tooltip="Measured in pixels",
            ),
        ]
    ).send()

    # Named ``active_profile`` so the module-level ``chat_profile`` function is not shadowed.
    active_profile = cl.user_session.get("chat_profile")
    await cl.Message(
        content=f"starting chat using the {active_profile} chat profile"
    ).send()

    print("A new chat session has started!")
    cl.user_session.set("session", {"history": [], "context": []})

    image = cl.Image(
        url="https://c.tenor.com/uzWDSSLMCmkAAAAd/tenor.gif",
        name="cat image",
        display="inline",
    )

    # Attach the image to the message
    await cl.Message(
        content="You are such a good girl, aren't you?!",
        elements=[image],
    ).send()

    text_content = "Hello, this is a text element."
    await cl.Message(
        content="Check out this text element!",
        elements=[cl.Text(name="simple_text", content=text_content, display="inline")],
    ).send()

    await cl.Message(
        content="Here is an audio file",
        elements=[cl.Audio(path="./assets/audio.mp3", display="inline")],
    ).send()

    await cl.Avatar(
        name="Tool 1",
        url="https://avatars.githubusercontent.com/u/128686189?s=400&u=a1d1553023f8ea0921fba0debbe92a8c5f840dd9&v=4",
    ).send()

    await cl.Message(
        content="This message should not have an avatar!", author="Tool 0"
    ).send()

    await cl.Message(
        content="This message should have an avatar!", author="Tool 1"
    ).send()

    await cl.Message(
        content="This message has a file element",
        elements=[
            cl.File(
                name="quickstart.py",
                path="./quickstart.py",
                display="inline",
            ),
        ],
    ).send()

    # Sending an action button within a chatbot message
    actions = [
        cl.Action(name="action_button", value="example_value", description="Click me!")
    ]
    await cl.Message(content="Interact with this action button:", actions=actions).send()
|
||||
|
||||
@cl.on_settings_update
async def setup_agent(settings):
    """Print the settings value received by the settings-update callback."""
    print("on_settings_update", settings)
|
||||
|
||||
@cl.on_stop
def on_stop():
    """Print a notice from the ``on_stop`` callback."""
    print("The user wants to stop the task!")
|
||||
|
||||
@cl.on_chat_end
def on_chat_end():
    """Print a notice from the ``on_chat_end`` callback."""
    print("The user disconnected!")
|
||||
|
||||
|
||||
@cl.on_chat_resume
async def on_chat_resume(thread: ThreadDict):
    """Print a notice from the ``on_chat_resume`` callback; ``thread`` is not inspected."""
    print("The user resumed a previous chat session!")
|
||||
|
||||
|
||||
|
||||
|
||||
# @cl.on_message
async def on_message(message: cl.Message):
    """Non-streaming message handler (its decorator is commented out).

    Records the user turn, requests a completion and records the reply in
    the session history. The reply is not sent to the UI: the statements
    that would do so are commented out at the end of the body.
    """
    history = cl.user_session.get("session")["history"]
    history.append({"role": "user", "content": message.content})

    system_prompt = {"content": "You are a helpful bot", "role": "system"}
    response = await client.chat.completions.create(
        messages=[system_prompt, *history],
        **settings,
    )

    # Add assistant message to the history
    reply_text = response.choices[0].message.content
    history.append({"role": "assistant", "content": reply_text})

    # msg.content = response.choices[0].message.content
    # await msg.update()

    # await cl.Message(content=response.choices[0].message.content).send()
|
||||
|
||||
@cl.on_message
async def on_message(message: cl.Message):
    """Stream a chat completion for the incoming message and keep the session history in sync.

    The user turn is appended to the session history, an empty message is
    sent so tokens can be streamed into it, and the final message content is
    appended to the history as the assistant turn.
    """
    history = cl.user_session.get("session")["history"]
    history.append({"role": "user", "content": message.content})

    # Send an empty message first; streamed tokens are added to it below.
    msg = cl.Message(content="")
    await msg.send()

    stream = await client.chat.completions.create(
        messages=[
            {"content": "You are a helpful bot", "role": "system"},
            *history,
        ],
        stream=True,
        **settings,
    )

    async for part in stream:
        # A chunk may arrive with an empty ``choices`` list (for example a
        # usage-only chunk); indexing it would raise IndexError.
        if not part.choices:
            continue
        if token := part.choices[0].delta.content:
            await msg.stream_token(token)

    # Add assistant message to the history
    history.append({"role": "assistant", "content": msg.content})
    await msg.update()
|
||||
|
||||
if __name__ == "__main__":
    # Imported only when executed as a script; hands this file to ``run_chainlit``.
    from chainlit.cli import run_chainlit

    run_chainlit(__file__)
|
||||
@@ -1,238 +0,0 @@
|
||||
# Make sure to install the required packages: chainlit and groq
|
||||
import os, time
|
||||
from openai import AsyncOpenAI
|
||||
import chainlit as cl
|
||||
import re
|
||||
import requests
|
||||
from io import BytesIO
|
||||
from chainlit.element import ElementBased
|
||||
from groq import Groq
|
||||
|
||||
# Import threadpools to run the crawl_url function in a separate thread
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
# Async OpenAI-style client pointed at Groq's endpoint; the key is read from GROQ_API_KEY.
client = AsyncOpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key=os.getenv("GROQ_API_KEY"),
)

# Instrument the OpenAI client
cl.instrument_openai()

# Keyword arguments unpacked into the ``client.chat.completions.create`` call below.
settings = dict(
    model="llama3-8b-8192",
    temperature=0.5,
    max_tokens=500,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)
|
||||
|
||||
# Compiled once at import time instead of on every call.
_URL_PATTERN = re.compile(r"https?://\S+")
# Characters that commonly follow a URL in running text but are not part of it.
_TRAILING_PUNCTUATION = ".,;:!?\"'"


def extract_urls(text):
    """Return every http(s) URL found in *text*, in order of appearance.

    Sentence punctuation directly after a URL (``.``, ``,``, ``?`` ...) and
    closing parentheses without an opening partner inside the URL are
    stripped, so "see https://example.com." yields "https://example.com".

    Args:
        text: String to scan. ``None`` or an empty string yields an empty list.

    Returns:
        A list of URL strings; empty when nothing matches.
    """
    if not text:
        return []

    urls = []
    for candidate in _URL_PATTERN.findall(text):
        url = candidate.rstrip(_TRAILING_PUNCTUATION)
        # Drop closing parentheses that belong to the surrounding sentence,
        # e.g. the last ")" in "(see https://example.com)". Balanced pairs
        # such as ".../Foo_(bar)" are kept.
        while url.endswith(")") and url.count(")") > url.count("("):
            url = url[:-1].rstrip(_TRAILING_PUNCTUATION)
        urls.append(url)
    return urls
|
||||
|
||||
def crawl_url(url, timeout=60):
    """Crawl *url* through the hosted Crawl4AI endpoint and return its markdown.

    Args:
        url: Address of the page to crawl.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests`` may wait indefinitely on an unresponsive server.

    Returns:
        The ``markdown`` field of the first entry in the response's
        ``results`` list.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
        KeyError, IndexError: If the response JSON lacks the expected fields.
    """
    payload = {
        "urls": [url],
        "include_raw_html": True,
        "word_count_threshold": 10,
        "extraction_strategy": "NoExtractionStrategy",
        "chunking_strategy": "RegexChunking",
    }
    response = requests.post(
        "https://crawl4ai.com/crawl", json=payload, timeout=timeout
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as a result.
    response.raise_for_status()
    first_result = response.json()["results"][0]
    return first_result["markdown"]
|
||||
|
||||
@cl.on_chat_start
async def on_chat_start():
    """Store an empty session (history list, context dict) and send a welcome message."""
    fresh_session = {"history": [], "context": {}}
    cl.user_session.set("session", fresh_session)

    greeting = cl.Message(content="Welcome to the chat! How can I assist you today?")
    await greeting.send()
|
||||
|
||||
@cl.on_message
async def on_message(message: cl.Message):
    """Answer a user message, crawling any URLs it contains and using them as context.

    Steps:
      1. Extract URLs from the message and crawl them via ``crawl_url``.
      2. Store each crawled page in the session context under ``REF_<n>``.
      3. Build a system message embedding all stored pages as appendices.
      4. Stream the model's reply into a Chainlit message.
      5. Record the reply in the history and, when context exists, append a
         reference list to the displayed message.
    """
    import asyncio  # local import: this module does not import asyncio at top level

    user_session = cl.user_session.get("session")
    context = user_session["context"]
    history = user_session["history"]

    # Extract URLs from the user's message
    urls = extract_urls(message.content)

    # ``crawl_url`` performs a blocking HTTP request. Run the crawls in worker
    # threads and await them so the event loop is not blocked meanwhile.
    loop = asyncio.get_running_loop()
    results = await asyncio.gather(
        *(loop.run_in_executor(None, crawl_url, url) for url in urls),
        return_exceptions=True,
    )

    for url, result in zip(urls, results):
        if isinstance(result, BaseException):
            # One unreachable link should not abort the whole reply.
            print(f"Failed to crawl {url}: {result}")
            continue
        ref_number = f"REF_{len(context) + 1}"
        context[ref_number] = {"url": url, "content": result}

    history.append({"role": "user", "content": message.content})

    # Create a system message that includes the context
    appendices = [
        f'<appendix ref="{ref}">\n{data["content"]}\n</appendix>'
        for ref, data in context.items()
    ]
    if appendices:
        instructions = (
            "You are a helpful bot. Use the following context for answering questions. "
            "Refer to the sources using the REF number in square brackets, e.g., [1], only if the source is given in the appendices below.\n\n"
            "If the question requires any information from the provided appendices or context, refer to the sources. "
            "If not, there is no need to add a references section. "
            "At the end of your response, provide a reference section listing the URLs and their REF numbers only if sources from the appendices were used.\n\n"
        )
        # Explicit ``+``: with implicit literal concatenation the whole
        # instruction text would become the *separator* passed to ``join``
        # and vanish whenever there is a single appendix.
        system_content = instructions + "\n\n".join(appendices)
    else:
        system_content = "You are a helpful assistant."
    system_message = {"role": "system", "content": system_content}

    # Send an empty message first; streamed tokens are added to it below.
    msg = cl.Message(content="")
    await msg.send()

    # Get response from the LLM
    stream = await client.chat.completions.create(
        messages=[system_message, *history],
        stream=True,
        **settings,
    )

    tokens = []
    async for part in stream:
        # A chunk may arrive with an empty ``choices`` list; skip it rather
        # than fail with IndexError.
        if not part.choices:
            continue
        if token := part.choices[0].delta.content:
            tokens.append(token)
            await msg.stream_token(token)
    assistant_response = "".join(tokens)

    # Add assistant message to the history
    history.append({"role": "assistant", "content": assistant_response})
    await msg.update()

    # Append the reference section to the displayed message only when there
    # is at least one source to list.
    if context:
        reference_lines = [
            f"[{ref.split('_')[1]}]: {data['url']}" for ref, data in context.items()
        ]
        msg.content += "\n\nReferences:\n" + "\n".join(reference_lines) + "\n"
        await msg.update()
|
||||
|
||||
|
||||
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.AudioChunk):
    """Accumulate incoming audio chunks in a ``BytesIO`` buffer kept in the user session."""
    if chunk.isStart:
        # Initialize the session for a new audio stream
        audio_buffer = BytesIO()
        extension = chunk.mimeType.split("/")[1]
        # This is required for whisper to recognize the file type
        audio_buffer.name = f"input_audio.{extension}"
        cl.user_session.set("audio_buffer", audio_buffer)
        cl.user_session.set("audio_mime_type", chunk.mimeType)

    # Write the chunks to a buffer and transcribe the whole audio at the end
    cl.user_session.get("audio_buffer").write(chunk.data)
|
||||
|
||||
@cl.step(type="tool")
async def speech_to_text(audio_file):
    """Transcribe *audio_file* with the ``whisper-large-v3`` model and return the text.

    Args:
        audio_file: Value passed unchanged as ``file`` to the transcription
            call on the module-level async ``client``.

    Returns:
        The ``text`` attribute of the transcription response.
    """
    # Only the shared async ``client`` is used; no separate Groq client is built.
    response = await client.audio.transcriptions.create(
        model="whisper-large-v3", file=audio_file
    )
    return response.text
|
||||
|
||||
|
||||
@cl.on_audio_end
async def on_audio_end(elements: list[ElementBased]):
    """Transcribe the buffered audio and handle the transcript like a typed user message.

    Args:
        elements: Passed in by the audio-end callback; not used by this
            implementation.
    """
    # Get the audio buffer from the session
    audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
    if audio_buffer is None:
        # No buffer was stored for this session, so there is nothing to transcribe.
        return

    audio_buffer.seek(0)  # Move the file pointer to the beginning
    audio_file = audio_buffer.read()
    audio_mime_type: str = cl.user_session.get("audio_mime_type")

    # (file name, raw bytes, MIME type) tuple handed to the transcription call.
    whisper_input = (audio_buffer.name, audio_file, audio_mime_type)

    start_time = time.perf_counter()
    transcription = await speech_to_text(whisper_input)
    elapsed = time.perf_counter() - start_time
    print(f"Transcription took {elapsed} seconds")

    user_msg = cl.Message(
        author="You",
        type="user_message",
        content=transcription,
    )
    await user_msg.send()
    # Reuse the text handler so the transcript is answered like any other message.
    await on_message(user_msg)
|
||||
|
||||
if __name__ == "__main__":
    # Imported only when executed as a script; hands this file to ``run_chainlit``.
    from chainlit.cli import run_chainlit

    run_chainlit(__file__)
|
||||
|
||||
|
||||
@@ -2,20 +2,39 @@
|
||||
|
||||
Crawl4AI, the **#1 trending GitHub repository**, streamlines web content extraction into AI-ready formats. Perfect for AI assistants, semantic search engines, or data pipelines, Crawl4AI transforms raw HTML into structured Markdown or JSON effortlessly. Integrate with LLMs, open-source models, or your own retrieval-augmented generation workflows.
|
||||
|
||||
**Key Links:**
|
||||
- **Website:** [https://crawl4ai.com](https://crawl4ai.com)
|
||||
- **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- **Colab Notebook:** [Try on Google Colab](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
|
||||
- **Quickstart Code Example:** [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py)
|
||||
- **Examples Folder:** [Crawl4AI Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples)
|
||||
**What Crawl4AI is not:**
|
||||
|
||||
Crawl4AI is not a replacement for traditional web scraping libraries, Selenium, or Playwright. It's not designed as a general-purpose web automation tool. Instead, Crawl4AI has a specific, focused goal:
|
||||
|
||||
- To generate perfect, AI-friendly data (particularly for LLMs) from web content
|
||||
- To maximize speed and efficiency in data extraction and processing
|
||||
- To operate at scale, from Raspberry Pi to cloud infrastructures
|
||||
|
||||
Crawl4AI is engineered with a "scale-first" mindset, aiming to handle millions of links while maintaining exceptional performance. It's super efficient and fast, optimized to:
|
||||
|
||||
1. Transform raw web content into structured, LLM-ready formats (Markdown/JSON)
|
||||
2. Implement intelligent extraction strategies to reduce reliance on costly API calls
|
||||
3. Provide a streamlined pipeline for AI data preparation and ingestion
|
||||
|
||||
In essence, Crawl4AI bridges the gap between web content and AI systems, focusing on delivering high-quality, processed data rather than offering broad web automation capabilities.
|
||||
|
||||
**Key Links:**
|
||||
|
||||
- **Website:** [https://crawl4ai.com](https://crawl4ai.com)
|
||||
- **GitHub:** [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- **Colab Notebook:** [Try on Google Colab](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
|
||||
- **Quickstart Code Example:** [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py)
|
||||
- **Examples Folder:** [Crawl4AI Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples)
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Crawl4AI Quick Start Guide: Your All-in-One AI-Ready Web Crawling \& AI Integration Solution](#crawl4ai-quick-start-guide-your-all-in-one-ai-ready-web-crawling--ai-integration-solution)
|
||||
- [Table of Contents](#table-of-contents)
|
||||
- [1. Introduction \& Key Concepts](#1-introduction--key-concepts)
|
||||
- [2. Installation \& Environment Setup](#2-installation--environment-setup)
|
||||
- [Test Your Installation](#test-your-installation)
|
||||
- [3. Core Concepts \& Configuration](#3-core-concepts--configuration)
|
||||
- [4. Basic Crawling \& Simple Extraction](#4-basic-crawling--simple-extraction)
|
||||
- [5. Markdown Generation \& AI-Optimized Output](#5-markdown-generation--ai-optimized-output)
|
||||
@@ -38,15 +57,17 @@ Crawl4AI, the **#1 trending GitHub repository**, streamlines web content extract
|
||||
---
|
||||
|
||||
## 1. Introduction & Key Concepts
|
||||
|
||||
Crawl4AI transforms websites into structured, AI-friendly data. It efficiently handles large-scale crawling, integrates with both proprietary and open-source LLMs, and optimizes content for semantic search or RAG pipelines.
|
||||
|
||||
**Quick Test:**
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def test_run():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
print(result.markdown)
|
||||
|
||||
@@ -60,12 +81,41 @@ If you see Markdown output, everything is working!
|
||||
---
|
||||
|
||||
## 2. Installation & Environment Setup
|
||||
|
||||
```bash
|
||||
# Install the package
|
||||
pip install crawl4ai
|
||||
crawl4ai-setup
|
||||
playwright install chromium
|
||||
|
||||
# Install Playwright with system dependencies (recommended)
|
||||
playwright install --with-deps # Installs all browsers
|
||||
|
||||
# Or install specific browsers:
|
||||
playwright install --with-deps chrome # Recommended for Colab/Linux
|
||||
playwright install --with-deps firefox
|
||||
playwright install --with-deps webkit
|
||||
playwright install --with-deps chromium
|
||||
|
||||
# Keep Playwright updated periodically
|
||||
playwright install
|
||||
```
|
||||
|
||||
> **Note**: For Google Colab and some Linux environments, use `chrome` instead of `chromium` - it tends to work more reliably.
|
||||
|
||||
### Test Your Installation
|
||||
Try these one-liners:
|
||||
|
||||
```bash
|
||||
# Visible browser test
|
||||
python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=False); page = browser.new_page(); page.goto('https://example.com'); input('Press Enter to close...')"
|
||||
|
||||
# Headless test (for servers/CI)
|
||||
python -c "from playwright.sync_api import sync_playwright; p = sync_playwright().start(); browser = p.chromium.launch(headless=True); page = browser.new_page(); page.goto('https://example.com'); print(f'Title: {page.title()}'); browser.close()"
|
||||
```
|
||||
|
||||
You should see a browser window (in visible test) loading example.com. If you get errors, try with Firefox using `playwright install --with-deps firefox`.
|
||||
|
||||
|
||||
**Try in Colab:**
|
||||
[Open Colab Notebook](https://colab.research.google.com/drive/1SgRPrByQLzjRfwoRNq1wSGE9nYY_EE8C?usp=sharing)
|
||||
|
||||
@@ -74,16 +124,19 @@ playwright install chromium
|
||||
---
|
||||
|
||||
## 3. Core Concepts & Configuration
|
||||
|
||||
Use `AsyncWebCrawler`, `CrawlerRunConfig`, and `BrowserConfig` to control crawling.
|
||||
|
||||
**Example config:**
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
verbose=True,
|
||||
viewport_width=1080,
|
||||
viewport_height=600,
|
||||
text_mode=False,
|
||||
ignore_https_errors=True,
|
||||
java_script_enabled=True
|
||||
@@ -97,7 +150,7 @@ run_config = CrawlerRunConfig(
|
||||
wait_for="css:.article-loaded",
|
||||
page_timeout=60000,
|
||||
delay_before_return_html=1.0,
|
||||
mean_delay=0.1,
|
||||
mean_delay=0.1,
|
||||
max_range=0.3,
|
||||
process_iframes=True,
|
||||
remove_overlay_elements=True,
|
||||
@@ -115,15 +168,17 @@ run_config = CrawlerRunConfig(
|
||||
```
|
||||
|
||||
**Prefixes:**
|
||||
- `http://` or `https://` for live pages
|
||||
- `file://local.html` for local
|
||||
- `raw:<html>` for raw HTML strings
|
||||
|
||||
- `http://` or `https://` for live pages
|
||||
- `file://local.html` for local files
|
||||
- `raw:<html>` for raw HTML strings
|
||||
|
||||
**More info:** [See /docs/async_webcrawler](#) or [3_async_webcrawler.ex.md](https://github.com/unclecode/crawl4ai/blob/main/async_webcrawler.ex.md)
|
||||
|
||||
---
|
||||
|
||||
## 4. Basic Crawling & Simple Extraction
|
||||
|
||||
```python
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://news.example.com/article", config=run_config)
|
||||
@@ -137,13 +192,15 @@ async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
## 5. Markdown Generation & AI-Optimized Output
|
||||
|
||||
After crawling, `result.markdown_v2` provides:
|
||||
- `raw_markdown`: Unfiltered markdown
|
||||
- `markdown_with_citations`: Links as references at the bottom
|
||||
- `references_markdown`: A separate list of reference links
|
||||
- `fit_markdown`: Filtered, relevant markdown (e.g., after BM25)
|
||||
- `fit_html`: The HTML used to produce `fit_markdown`
|
||||
|
||||
- `raw_markdown`: Unfiltered markdown
|
||||
- `markdown_with_citations`: Links as references at the bottom
|
||||
- `references_markdown`: A separate list of reference links
|
||||
- `fit_markdown`: Filtered, relevant markdown (e.g., after BM25)
|
||||
- `fit_html`: The HTML used to produce `fit_markdown`
|
||||
|
||||
**Example:**
|
||||
|
||||
```python
|
||||
print("RAW:", result.markdown_v2.raw_markdown[:200])
|
||||
print("CITED:", result.markdown_v2.markdown_with_citations[:200])
|
||||
@@ -158,9 +215,11 @@ For AI training, `fit_markdown` focuses on the most relevant content.
|
||||
---
|
||||
|
||||
## 6. Structured Data Extraction (CSS, XPath, LLM)
|
||||
|
||||
Extract JSON data without LLMs:
|
||||
|
||||
**CSS:**
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
@@ -176,6 +235,7 @@ run_config.extraction_strategy = JsonCssExtractionStrategy(schema)
|
||||
```
|
||||
|
||||
**XPath:**
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
||||
|
||||
@@ -195,6 +255,7 @@ run_config.extraction_strategy = JsonXPathExtractionStrategy(xpath_schema)
|
||||
---
|
||||
|
||||
## 7. Advanced Extraction: LLM & Open-Source Models
|
||||
|
||||
Use LLMExtractionStrategy for complex tasks. Works with OpenAI or open-source models (e.g., Ollama).
|
||||
|
||||
```python
|
||||
@@ -217,7 +278,9 @@ run_config.extraction_strategy = LLMExtractionStrategy(
|
||||
---
|
||||
|
||||
## 8. Page Interactions, JS Execution, & Dynamic Content
|
||||
|
||||
Insert `js_code` and use `wait_for` to ensure content loads. Example:
|
||||
|
||||
```python
|
||||
run_config.js_code = """
|
||||
(async () => {
|
||||
@@ -233,6 +296,7 @@ run_config.wait_for = "css:.item-loaded"
|
||||
---
|
||||
|
||||
## 9. Media, Links, & Metadata Handling
|
||||
|
||||
`result.media["images"]`: List of images with `src`, `score`, `alt`. Score indicates relevance.
|
||||
|
||||
`result.media["videos"]`, `result.media["audios"]` similarly hold media info.
|
||||
@@ -242,6 +306,7 @@ run_config.wait_for = "css:.item-loaded"
|
||||
`result.metadata`: Title, description, keywords, author.
|
||||
|
||||
**Example:**
|
||||
|
||||
```python
|
||||
# Images
|
||||
for img in result.media["images"]:
|
||||
@@ -263,30 +328,37 @@ print("Description:", result.metadata["description"])
|
||||
## 10. Authentication & Identity Preservation
|
||||
|
||||
### Manual Setup via User Data Directory
|
||||
|
||||
1. **Open Chrome with a custom user data dir:**
|
||||
```bash
|
||||
"C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir="C:\MyChromeProfile"
|
||||
```
|
||||
On macOS:
|
||||
```bash
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --user-data-dir="/Users/username/ChromeProfiles/MyProfile"
|
||||
```
|
||||
|
||||
```bash
|
||||
"C:\Program Files\Google\Chrome\Application\chrome.exe" --user-data-dir="C:\MyChromeProfile"
|
||||
```
|
||||
|
||||
On macOS:
|
||||
|
||||
```bash
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" --user-data-dir="/Users/username/ChromeProfiles/MyProfile"
|
||||
```
|
||||
|
||||
2. **Log in to sites, solve CAPTCHAs, adjust settings manually.**
|
||||
The browser saves cookies/localStorage in that directory.
|
||||
|
||||
3. **Use `user_data_dir` in `BrowserConfig`:**
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
user_data_dir="/Users/username/ChromeProfiles/MyProfile"
|
||||
)
|
||||
```
|
||||
|
||||
Now the crawler starts with those cookies, sessions, etc.
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
user_data_dir="/Users/username/ChromeProfiles/MyProfile"
|
||||
)
|
||||
```
|
||||
|
||||
Now the crawler starts with those cookies, sessions, etc.
|
||||
|
||||
### Using `storage_state`
|
||||
|
||||
Alternatively, export and reuse storage states:
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
@@ -301,7 +373,9 @@ No repeated logins needed.
|
||||
---
|
||||
|
||||
## 11. Proxy & Security Enhancements
|
||||
|
||||
Use `proxy_config` for authenticated proxies:
|
||||
|
||||
```python
|
||||
browser_config.proxy_config = {
|
||||
"server": "http://proxy.example.com:8080",
|
||||
@@ -317,6 +391,7 @@ Combine with `headers` or `ignore_https_errors` as needed.
|
||||
---
|
||||
|
||||
## 12. Screenshots, PDFs & File Downloads
|
||||
|
||||
Enable `screenshot=True` or `pdf=True` in `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
@@ -325,6 +400,7 @@ run_config.pdf = True
|
||||
```
|
||||
|
||||
After crawling:
|
||||
|
||||
```python
|
||||
if result.screenshot:
|
||||
with open("page.png", "wb") as f:
|
||||
@@ -336,6 +412,7 @@ if result.pdf:
|
||||
```
|
||||
|
||||
**File Downloads:**
|
||||
|
||||
```python
|
||||
browser_config.accept_downloads = True
|
||||
browser_config.downloads_path = "./downloads"
|
||||
@@ -351,7 +428,9 @@ Also [10_file_download.md](https://github.com/unclecode/crawl4ai/blob/main/file_
|
||||
---
|
||||
|
||||
## 13. Caching & Performance Optimization
|
||||
|
||||
Set `cache_mode` to reuse fetch results:
|
||||
|
||||
```python
|
||||
from crawl4ai import CacheMode
|
||||
run_config.cache_mode = CacheMode.ENABLED
|
||||
@@ -364,11 +443,13 @@ Adjust delays, increase concurrency, or use `text_mode=True` for faster extracti
|
||||
---
|
||||
|
||||
## 14. Hooks for Custom Logic
|
||||
|
||||
Hooks let you run custom code at specific lifecycle events, so you do not need to create pages manually in `on_browser_created`.
|
||||
|
||||
Use `on_page_context_created` to apply routing or modify page contexts before crawling the URL:
|
||||
|
||||
**Example Hook:**
|
||||
|
||||
```python
|
||||
async def on_page_context_created_hook(context, page, **kwargs):
|
||||
# Block all images to speed up load
|
||||
@@ -388,21 +469,25 @@ This hook is clean and doesn’t create a separate page itself—it just modifie
|
||||
---
|
||||
|
||||
## 15. Dockerization & Scaling
|
||||
|
||||
Use Docker images:
|
||||
|
||||
- AMD64 basic:
|
||||
- AMD64 basic:
|
||||
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:basic-amd64
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64
|
||||
```
|
||||
|
||||
- ARM64 for M1/M2:
|
||||
- ARM64 for M1/M2:
|
||||
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:basic-arm64
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64
|
||||
```
|
||||
|
||||
- GPU support:
|
||||
- GPU support:
|
||||
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:gpu-amd64
|
||||
docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu-amd64
|
||||
@@ -415,25 +500,28 @@ Scale with load balancers or Kubernetes.
|
||||
---
|
||||
|
||||
## 16. Troubleshooting & Common Pitfalls
|
||||
- Empty results? Relax filters, check selectors.
|
||||
- Timeouts? Increase `page_timeout` or refine `wait_for`.
|
||||
- CAPTCHAs? Use `user_data_dir` or `storage_state` after manual solving.
|
||||
- JS errors? Try headful mode for debugging.
|
||||
|
||||
- Empty results? Relax filters, check selectors.
|
||||
- Timeouts? Increase `page_timeout` or refine `wait_for`.
|
||||
- CAPTCHAs? Use `user_data_dir` or `storage_state` after manual solving.
|
||||
- JS errors? Try headful mode for debugging.
|
||||
|
||||
Check [examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) & [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for more code.
|
||||
|
||||
---
|
||||
|
||||
## 17. Comprehensive End-to-End Example
|
||||
|
||||
Combine hooks, JS execution, PDF saving, LLM extraction—see [quickstart_async.config.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_async.config.py) for a full example.
|
||||
|
||||
---
|
||||
|
||||
## 18. Further Resources & Community
|
||||
- **Docs:** [https://crawl4ai.com](https://crawl4ai.com)
|
||||
- **Issues & PRs:** [https://github.com/unclecode/crawl4ai/issues](https://github.com/unclecode/crawl4ai/issues)
|
||||
|
||||
- **Docs:** [https://crawl4ai.com](https://crawl4ai.com)
|
||||
- **Issues & PRs:** [https://github.com/unclecode/crawl4ai/issues](https://github.com/unclecode/crawl4ai/issues)
|
||||
|
||||
Follow [@unclecode](https://x.com/unclecode) for news & community updates.
|
||||
|
||||
**Happy Crawling!**
|
||||
Leverage Crawl4AI to feed your AI models with clean, structured web data today.
|
||||
Leverage Crawl4AI to feed your AI models with clean, structured web data today.
|
||||
|
||||
@@ -65,7 +65,7 @@
|
||||
|
||||
#### `viewport_width` and `viewport_height`
|
||||
- **Description**: Sets the default browser viewport dimensions.
|
||||
- Default: `1920` (width), `1080` (height)
|
||||
- Default: `1080` (width), `600` (height)
|
||||
- **Use Case**:
|
||||
- Adjust for crawling responsive layouts or specific device emulations.
|
||||
|
||||
@@ -134,6 +134,19 @@
|
||||
- **Use Case**:
|
||||
- Use for advanced browser configurations like WebRTC or GPU tuning.
|
||||
|
||||
#### `verbose`
|
||||
- **Description**: Enable verbose logging of browser operations.
|
||||
- Default: `True`
|
||||
- **Use Case**:
|
||||
- Enable for detailed logging during development and debugging.
|
||||
- Disable in production for better performance.
|
||||
|
||||
#### `sleep_on_close`
|
||||
- **Description**: Adds a delay before closing the browser.
|
||||
- Default: `False`
|
||||
- **Use Case**:
|
||||
- Enable when you need to ensure all browser operations are complete before closing.
|
||||
|
||||
## CrawlerRunConfig
|
||||
The `CrawlerRunConfig` class centralizes parameters for controlling crawl operations. This configuration covers content extraction, page interactions, caching, and runtime behaviors. Below is an exhaustive breakdown of parameters and their best-use scenarios.
|
||||
|
||||
@@ -341,3 +354,37 @@ The `CrawlerRunConfig` class centralizes parameters for controlling crawl operat
|
||||
- **Use Case**:
|
||||
- Enable when debugging JavaScript errors on pages.
|
||||
|
||||
##### `parser_type`
|
||||
- **Description**: Type of parser to use for HTML parsing.
|
||||
- Default: `"lxml"`
|
||||
- **Use Case**:
|
||||
- Use when specific HTML parsing requirements are needed.
|
||||
- `"lxml"` provides good performance and standards compliance.
|
||||
|
||||
##### `prettiify`
|
||||
- **Description**: Apply `fast_format_html` to produce prettified HTML output.
|
||||
- Default: `False`
|
||||
- **Use Case**:
|
||||
- Enable for better readability of extracted HTML content.
|
||||
- Useful during development and debugging.
|
||||
|
||||
##### `fetch_ssl_certificate`
|
||||
- **Description**: Fetch and store SSL certificate information during crawling.
|
||||
- Default: `False`
|
||||
- **Use Case**:
|
||||
- Enable when SSL certificate analysis is required.
|
||||
- Useful for security audits and certificate validation.
|
||||
|
||||
##### `url`
|
||||
- **Description**: Target URL for the crawl operation.
|
||||
- Default: `None`
|
||||
- **Use Case**:
|
||||
- Set when initializing a crawler for a specific URL.
|
||||
- Can be overridden during actual crawl operations.
|
||||
|
||||
##### `log_console`
|
||||
- **Description**: Log browser console messages during crawling.
|
||||
- Default: `False`
|
||||
- **Use Case**:
|
||||
- Enable to capture JavaScript console output.
|
||||
- Useful for debugging JavaScript-heavy pages.
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
This document provides a comprehensive, human-oriented overview of the `AsyncWebCrawler` class and related components from the `crawl4ai` package. It explains the motivations behind asynchronous crawling, shows how to configure and run crawls, and provides examples for advanced features like dynamic content handling, extraction strategies, caching, containerization, and troubleshooting.
|
||||
|
||||
## Introduction
|
||||
[EDIT: This is not a good way to introduce the library. The library excels at generating crawl data in the form of markdown or extracted JSON as quickly as possible. It is designed to be efficient in terms of memory and CPU usage. Users should choose this library because it generates markdown suitable for large language models and AI. Additionally, it can create structured data, which is beneficial because it supports attaching large language models to generate structured data. It also includes techniques like JSON CSS and JSON XPath extraction, allowing users to define patterns and extract data quickly. One of the library's strengths is its ability to work everywhere. It can crawl any website by offering various capabilities, such as connecting to a remote browser or using persistent data. This feature allows developers to create their own identity on websites where they have authentication access, enabling them to crawl without being mistakenly identified as a bot. This is a better way to introduce the library. In these documents, we discuss the main object, the main class, `AsyncWebCrawler`, and all the functionalities we can achieve with `AsyncWebCrawler`.]
|
||||
|
||||
Crawling websites can be slow if done sequentially, especially when handling large numbers of URLs or rendering dynamic pages. Asynchronous crawling helps you run multiple operations concurrently, improving throughput and performance. The `AsyncWebCrawler` class leverages asynchronous I/O and browser automation tools to fetch content efficiently, handle complex DOM interactions, and extract structured data.
|
||||
|
||||
|
||||
@@ -74,9 +74,10 @@ The Markdown generation process transforms raw HTML into a structured format. At
|
||||
|
||||
```python
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler
|
||||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
options={
|
||||
"ignore_links": True,
|
||||
|
||||
@@ -310,22 +310,6 @@ response = requests.post("http://localhost:11235/crawl", json=request)
|
||||
> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure!
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Usage Examples 📝
|
||||
|
||||
### Basic Crawling
|
||||
|
||||
Reference in New Issue
Block a user