diff --git a/.local/.chainlit/config.toml b/.local/.chainlit/config.toml deleted file mode 100644 index 810b06f3..00000000 --- a/.local/.chainlit/config.toml +++ /dev/null @@ -1,121 +0,0 @@ -[project] -# Whether to enable telemetry (default: true). No personal data is collected. -enable_telemetry = true - - -# List of environment variables to be provided by each user to use the app. -user_env = [] - -# Duration (in seconds) during which the session is saved when the connection is lost -session_timeout = 3600 - -# Enable third parties caching (e.g LangChain cache) -cache = false - -# Authorized origins -allow_origins = ["*"] - -# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317) -# follow_symlink = false - -[features] -# Show the prompt playground -prompt_playground = true - -# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript) -unsafe_allow_html = false - -# Process and display mathematical expressions. This can clash with "$" characters in messages. -latex = false - -# Automatically tag threads with the current chat profile (if a chat profile is used) -auto_tag_thread = true - -# Authorize users to spontaneously upload files with messages -[features.spontaneous_file_upload] - enabled = true - accept = ["*/*"] - max_files = 20 - max_size_mb = 500 - -[features.audio] - # Threshold for audio recording - min_decibels = -45 - # Delay for the user to start speaking in MS - initial_silence_timeout = 3000 - # Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop. - silence_timeout = 1500 - # Above this duration (MS), the recording will forcefully stop. - max_duration = 15000 - # Duration of the audio chunks in MS - chunk_duration = 1000 - # Sample rate of the audio - sample_rate = 44100 - -[UI] -# Name of the app and chatbot. -name = "Chatbot" - -# Show the readme while the thread is empty. -show_readme_as_default = true - -# Description of the app and chatbot. This is used for HTML tags. -# description = "" - -# Large size content are by default collapsed for a cleaner ui -default_collapse_content = true - -# The default value for the expand messages settings. -default_expand_messages = false - -# Hide the chain of thought details from the user in the UI. -hide_cot = false - -# Link to your github repo. This will add a github button in the UI's header. -# github = "" - -# Specify a CSS file that can be used to customize the user interface. -# The CSS file can be served from the public directory or via an external link. -# custom_css = "/public/test.css" - -# Specify a Javascript file that can be used to customize the user interface. -# The Javascript file can be served from the public directory. -# custom_js = "/public/test.js" - -# Specify a custom font url. -# custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap" - -# Specify a custom meta image url. -# custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png" - -# Specify a custom build directory for the frontend. -# This can be used to customize the frontend code. -# Be careful: If this is a relative path, it should not start with a slash. -# custom_build = "./public/build" - -[UI.theme] - #layout = "wide" - #font_family = "Inter, sans-serif" -# Override default MUI light theme. 
(Check theme.ts) -[UI.theme.light] - #background = "#FAFAFA" - #paper = "#FFFFFF" - - [UI.theme.light.primary] - #main = "#F80061" - #dark = "#980039" - #light = "#FFE7EB" - -# Override default MUI dark theme. (Check theme.ts) -[UI.theme.dark] - #background = "#FAFAFA" - #paper = "#FFFFFF" - - [UI.theme.dark.primary] - #main = "#F80061" - #dark = "#980039" - #light = "#FFE7EB" - - -[meta] -generated_by = "1.1.202" diff --git a/.local/.chainlit/translations/en-US.json b/.local/.chainlit/translations/en-US.json deleted file mode 100644 index 0bca7207..00000000 --- a/.local/.chainlit/translations/en-US.json +++ /dev/null @@ -1,231 +0,0 @@ -{ - "components": { - "atoms": { - "buttons": { - "userButton": { - "menu": { - "settings": "Settings", - "settingsKey": "S", - "APIKeys": "API Keys", - "logout": "Logout" - } - } - } - }, - "molecules": { - "newChatButton": { - "newChat": "New Chat" - }, - "tasklist": { - "TaskList": { - "title": "\ud83d\uddd2\ufe0f Task List", - "loading": "Loading...", - "error": "An error occured" - } - }, - "attachments": { - "cancelUpload": "Cancel upload", - "removeAttachment": "Remove attachment" - }, - "newChatDialog": { - "createNewChat": "Create new chat?", - "clearChat": "This will clear the current messages and start a new chat.", - "cancel": "Cancel", - "confirm": "Confirm" - }, - "settingsModal": { - "settings": "Settings", - "expandMessages": "Expand Messages", - "hideChainOfThought": "Hide Chain of Thought", - "darkMode": "Dark Mode" - }, - "detailsButton": { - "using": "Using", - "running": "Running", - "took_one": "Took {{count}} step", - "took_other": "Took {{count}} steps" - }, - "auth": { - "authLogin": { - "title": "Login to access the app.", - "form": { - "email": "Email address", - "password": "Password", - "noAccount": "Don't have an account?", - "alreadyHaveAccount": "Already have an account?", - "signup": "Sign Up", - "signin": "Sign In", - "or": "OR", - "continue": "Continue", - "forgotPassword": "Forgot password?", - "passwordMustContain": "Your password must contain:", - "emailRequired": "email is a required field", - "passwordRequired": "password is a required field" - }, - "error": { - "default": "Unable to sign in.", - "signin": "Try signing in with a different account.", - "oauthsignin": "Try signing in with a different account.", - "redirect_uri_mismatch": "The redirect URI is not matching the oauth app configuration.", - "oauthcallbackerror": "Try signing in with a different account.", - "oauthcreateaccount": "Try signing in with a different account.", - "emailcreateaccount": "Try signing in with a different account.", - "callback": "Try signing in with a different account.", - "oauthaccountnotlinked": "To confirm your identity, sign in with the same account you used originally.", - "emailsignin": "The e-mail could not be sent.", - "emailverify": "Please verify your email, a new email has been sent.", - "credentialssignin": "Sign in failed. Check the details you provided are correct.", - "sessionrequired": "Please sign in to access this page." - } - }, - "authVerifyEmail": { - "almostThere": "You're almost there! 
We've sent an email to ", - "verifyEmailLink": "Please click on the link in that email to complete your signup.", - "didNotReceive": "Can't find the email?", - "resendEmail": "Resend email", - "goBack": "Go Back", - "emailSent": "Email sent successfully.", - "verifyEmail": "Verify your email address" - }, - "providerButton": { - "continue": "Continue with {{provider}}", - "signup": "Sign up with {{provider}}" - }, - "authResetPassword": { - "newPasswordRequired": "New password is a required field", - "passwordsMustMatch": "Passwords must match", - "confirmPasswordRequired": "Confirm password is a required field", - "newPassword": "New password", - "confirmPassword": "Confirm password", - "resetPassword": "Reset Password" - }, - "authForgotPassword": { - "email": "Email address", - "emailRequired": "email is a required field", - "emailSent": "Please check the email address {{email}} for instructions to reset your password.", - "enterEmail": "Enter your email address and we will send you instructions to reset your password.", - "resendEmail": "Resend email", - "continue": "Continue", - "goBack": "Go Back" - } - } - }, - "organisms": { - "chat": { - "history": { - "index": { - "showHistory": "Show history", - "lastInputs": "Last Inputs", - "noInputs": "Such empty...", - "loading": "Loading..." - } - }, - "inputBox": { - "input": { - "placeholder": "Type your message here..." - }, - "speechButton": { - "start": "Start recording", - "stop": "Stop recording" - }, - "SubmitButton": { - "sendMessage": "Send message", - "stopTask": "Stop Task" - }, - "UploadButton": { - "attachFiles": "Attach files" - }, - "waterMark": { - "text": "Built with" - } - }, - "Messages": { - "index": { - "running": "Running", - "executedSuccessfully": "executed successfully", - "failed": "failed", - "feedbackUpdated": "Feedback updated", - "updating": "Updating" - } - }, - "dropScreen": { - "dropYourFilesHere": "Drop your files here" - }, - "index": { - "failedToUpload": "Failed to upload", - "cancelledUploadOf": "Cancelled upload of", - "couldNotReachServer": "Could not reach the server", - "continuingChat": "Continuing previous chat" - }, - "settings": { - "settingsPanel": "Settings panel", - "reset": "Reset", - "cancel": "Cancel", - "confirm": "Confirm" - } - }, - "threadHistory": { - "sidebar": { - "filters": { - "FeedbackSelect": { - "feedbackAll": "Feedback: All", - "feedbackPositive": "Feedback: Positive", - "feedbackNegative": "Feedback: Negative" - }, - "SearchBar": { - "search": "Search" - } - }, - "DeleteThreadButton": { - "confirmMessage": "This will delete the thread as well as it's messages and elements.", - "cancel": "Cancel", - "confirm": "Confirm", - "deletingChat": "Deleting chat", - "chatDeleted": "Chat deleted" - }, - "index": { - "pastChats": "Past Chats" - }, - "ThreadList": { - "empty": "Empty...", - "today": "Today", - "yesterday": "Yesterday", - "previous7days": "Previous 7 days", - "previous30days": "Previous 30 days" - }, - "TriggerButton": { - "closeSidebar": "Close sidebar", - "openSidebar": "Open sidebar" - } - }, - "Thread": { - "backToChat": "Go back to chat", - "chatCreatedOn": "This chat was created on" - } - }, - "header": { - "chat": "Chat", - "readme": "Readme" - } - } - }, - "hooks": { - "useLLMProviders": { - "failedToFetchProviders": "Failed to fetch providers:" - } - }, - "pages": { - "Design": {}, - "Env": { - "savedSuccessfully": "Saved successfully", - "requiredApiKeys": "Required API Keys", - "requiredApiKeysInfo": "To use this app, the following API keys are required. 
The keys are stored on your device's local storage." - }, - "Page": { - "notPartOfProject": "You are not part of this project." - }, - "ResumeButton": { - "resumeChat": "Resume Chat" - } - } -} \ No newline at end of file diff --git a/.local/126/chromedriver b/.local/126/chromedriver deleted file mode 100755 index 6f7de54a..00000000 Binary files a/.local/126/chromedriver and /dev/null differ diff --git a/.local/issues_todo.md b/.local/issues_todo.md deleted file mode 100644 index 61bdc855..00000000 --- a/.local/issues_todo.md +++ /dev/null @@ -1 +0,0 @@ -Docker: https://github.com/unclecode/crawl4ai/issues/367 \ No newline at end of file diff --git a/.local/llm.txt/10_file_download.md b/.local/llm.txt/10_file_download.md deleted file mode 100644 index eac0f5cb..00000000 --- a/.local/llm.txt/10_file_download.md +++ /dev/null @@ -1,129 +0,0 @@ -# Download Handling in Crawl4AI - -This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files. - -## Enabling Downloads - -To enable downloads, set the `accept_downloads` parameter in the `BrowserConfig` object and pass it to the crawler. - -```python -from crawl4ai.async_configs import BrowserConfig, AsyncWebCrawler - -async def main(): - config = BrowserConfig(accept_downloads=True) # Enable downloads globally - async with AsyncWebCrawler(config=config) as crawler: - # ... your crawling logic ... - -asyncio.run(main()) -``` - -Or, enable it for a specific crawl by using `CrawlerRunConfig`: - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -async def main(): - async with AsyncWebCrawler() as crawler: - config = CrawlerRunConfig(accept_downloads=True) - result = await crawler.arun(url="https://example.com", config=config) - # ... -``` - -## Specifying Download Location - -Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory. - -```python -from crawl4ai.async_configs import BrowserConfig -import os - -downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path -os.makedirs(downloads_path, exist_ok=True) - -config = BrowserConfig(accept_downloads=True, downloads_path=downloads_path) - -async def main(): - async with AsyncWebCrawler(config=config) as crawler: - result = await crawler.arun(url="https://example.com") - # ... -``` - -## Triggering Downloads - -Downloads are typically triggered by user interactions on a web page, such as clicking a download button. Use `js_code` in `CrawlerRunConfig` to simulate these actions and `wait_for` to allow sufficient time for downloads to start. - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -config = CrawlerRunConfig( - js_code=""" - const downloadLink = document.querySelector('a[href$=".exe"]'); - if (downloadLink) { - downloadLink.click(); - } - """, - wait_for=5 # Wait 5 seconds for the download to start -) - -result = await crawler.arun(url="https://www.python.org/downloads/", config=config) -``` - -## Accessing Downloaded Files - -The `downloaded_files` attribute of the `CrawlResult` object contains paths to downloaded files. 
- -```python -if result.downloaded_files: - print("Downloaded files:") - for file_path in result.downloaded_files: - print(f"- {file_path}") - file_size = os.path.getsize(file_path) - print(f"- File size: {file_size} bytes") -else: - print("No files downloaded.") -``` - -## Example: Downloading Multiple Files - -```python -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -import os -from pathlib import Path - -async def download_multiple_files(url: str, download_path: str): - config = BrowserConfig(accept_downloads=True, downloads_path=download_path) - async with AsyncWebCrawler(config=config) as crawler: - run_config = CrawlerRunConfig( - js_code=""" - const downloadLinks = document.querySelectorAll('a[download]'); - for (const link of downloadLinks) { - link.click(); - await new Promise(r => setTimeout(r, 2000)); // Delay between clicks - } - """, - wait_for=10 # Wait for all downloads to start - ) - result = await crawler.arun(url=url, config=run_config) - - if result.downloaded_files: - print("Downloaded files:") - for file in result.downloaded_files: - print(f"- {file}") - else: - print("No files downloaded.") - -# Usage -download_path = os.path.join(Path.home(), ".crawl4ai", "downloads") -os.makedirs(download_path, exist_ok=True) - -asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path)) -``` - -## Important Considerations - -- **Browser Context:** Downloads are managed within the browser context. Ensure `js_code` correctly targets the download triggers on the webpage. -- **Timing:** Use `wait_for` in `CrawlerRunConfig` to manage download timing. -- **Error Handling:** Handle errors to manage failed downloads or incorrect paths gracefully. -- **Security:** Scan downloaded files for potential security threats before use. - -This revised guide ensures consistency with the `Crawl4AI` codebase by using `BrowserConfig` and `CrawlerRunConfig` for all download-related configurations. Let me know if further adjustments are needed! 
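For the "Error Handling" consideration above, a small guard around the reported paths is usually enough. This is an illustrative sketch rather than part of the original guide; it assumes `result` comes from a crawl run with `accept_downloads=True` as shown earlier.

```python
import os

# Defensively inspect result.downloaded_files: a reported path may belong to a
# download that never completed or was moved before we got to it.
for file_path in (result.downloaded_files or []):
    try:
        size = os.path.getsize(file_path)
        print(f"Downloaded: {file_path} ({size} bytes)")
    except OSError as exc:  # missing file, permission problem, etc.
        print(f"Could not read {file_path}: {exc}")
```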
\ No newline at end of file diff --git a/.local/llm.txt/10_file_download.q.md b/.local/llm.txt/10_file_download.q.md deleted file mode 100644 index 987149e7..00000000 --- a/.local/llm.txt/10_file_download.q.md +++ /dev/null @@ -1,10 +0,0 @@ -enable_downloads: Downloads must be enabled using accept_downloads parameter in BrowserConfig or CrawlerRunConfig | download settings, enable downloads | BrowserConfig(accept_downloads=True) -download_location: Set custom download directory using downloads_path in BrowserConfig, defaults to .crawl4ai/downloads | download folder, save location | BrowserConfig(downloads_path="/path/to/downloads") -download_trigger: Trigger downloads using js_code in CrawlerRunConfig to simulate click actions | download button, click download | CrawlerRunConfig(js_code="document.querySelector('a[download]').click()") -download_timing: Control download timing using wait_for parameter in CrawlerRunConfig | download wait, timeout | CrawlerRunConfig(wait_for=5) -access_downloads: Access downloaded files through downloaded_files attribute in CrawlResult | download results, file paths | result.downloaded_files -multiple_downloads: Download multiple files by clicking multiple download links with delay | batch download, multiple files | js_code="const links = document.querySelectorAll('a[download]'); for(const link of links) { link.click(); }" -download_verification: Check download success by examining downloaded_files list and file sizes | verify downloads, file check | if result.downloaded_files: print(os.path.getsize(file_path)) -browser_context: Downloads are managed within browser context and require proper js_code targeting | download management, browser scope | CrawlerRunConfig(js_code="...") -error_handling: Handle failed downloads and incorrect paths for robust download management | download errors, error handling | try-except around download operations -security_consideration: Scan downloaded files for security threats before use | security check, virus scan | No direct code reference \ No newline at end of file diff --git a/.local/llm.txt/11_page_interaction.md b/.local/llm.txt/11_page_interaction.md deleted file mode 100644 index 2a60ae85..00000000 --- a/.local/llm.txt/11_page_interaction.md +++ /dev/null @@ -1,190 +0,0 @@ -# Page Interaction - -Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events. 
- -## JavaScript Execution - -### Basic Execution - -```python -from crawl4ai.async_configs import CrawlerRunConfig - -# Single JavaScript command -config = CrawlerRunConfig( - js_code="window.scrollTo(0, document.body.scrollHeight);" -) -result = await crawler.arun(url="https://example.com", config=config) - -# Multiple commands -js_commands = [ - "window.scrollTo(0, document.body.scrollHeight);", - "document.querySelector('.load-more').click();", - "document.querySelector('#consent-button').click();" -] -config = CrawlerRunConfig(js_code=js_commands) -result = await crawler.arun(url="https://example.com", config=config) -``` - -### Wait Conditions - -### CSS-Based Waiting - -Wait for elements to appear: - -```python -config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content' -result = await crawler.arun(url="https://example.com", config=config) -``` - -### JavaScript-Based Waiting - -Wait for custom conditions: - -```python -# Wait for number of elements -wait_condition = """() => { - return document.querySelectorAll('.item').length > 10; -}""" - -config = CrawlerRunConfig(wait_for=f"js:{wait_condition}") -result = await crawler.arun(url="https://example.com", config=config) - -# Wait for dynamic content to load -wait_for_content = """() => { - const content = document.querySelector('.content'); - return content && content.innerText.length > 100; -}""" - -config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}") -result = await crawler.arun(url="https://example.com", config=config) -``` - -### Handling Dynamic Content - -### Load More Content - -Handle infinite scroll or load more buttons: - -```python -config = CrawlerRunConfig( - js_code=[ - "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom - "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more - ], - wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content -) -result = await crawler.arun(url="https://example.com", config=config) -``` - -### Form Interaction - -Handle forms and inputs: - -```python -js_form_interaction = """ - document.querySelector('#search').value = 'search term'; // Fill form fields - document.querySelector('form').submit(); // Submit form -""" - -config = CrawlerRunConfig( - js_code=js_form_interaction, - wait_for="css:.results" # Wait for results to load -) -result = await crawler.arun(url="https://example.com", config=config) -``` - -### Timing Control - -### Delays and Timeouts - -Control timing of interactions: - -```python -config = CrawlerRunConfig( - page_timeout=60000, # Page load timeout (ms) - delay_before_return_html=2.0 # Wait before capturing content -) -result = await crawler.arun(url="https://example.com", config=config) -``` - -### Complex Interactions Example - -Here's an example of handling a dynamic page with multiple interactions: - -```python -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig - -async def crawl_dynamic_content(): - async with AsyncWebCrawler() as crawler: - # Initial page load - config = CrawlerRunConfig( - js_code="document.querySelector('.cookie-accept')?.click();", # Handle cookie consent - wait_for="css:.main-content" - ) - result = await crawler.arun(url="https://example.com", config=config) - - # Load more content - session_id = "dynamic_session" # Keep session for multiple interactions - - for page in range(3): # Load 3 pages of content - config = CrawlerRunConfig( - session_id=session_id, 
- js_code=[ - "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom - "window.previousCount = document.querySelectorAll('.item').length;", # Store item count - "document.querySelector('.load-more')?.click();" # Click load more - ], - wait_for="""() => { - const currentCount = document.querySelectorAll('.item').length; - return currentCount > window.previousCount; - }""", - js_only=(page > 0) # Execute JS without reloading page for subsequent interactions - ) - result = await crawler.arun(url="https://example.com", config=config) - print(f"Page {page + 1} items:", len(result.cleaned_html)) - - # Clean up session - await crawler.crawler_strategy.kill_session(session_id) -``` - -### Using with Extraction Strategies - -Combine page interaction with structured extraction: - -```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy -from crawl4ai.async_configs import CrawlerRunConfig - -# Pattern-based extraction after interaction -schema = { - "name": "Dynamic Items", - "baseSelector": ".item", - "fields": [ - {"name": "title", "selector": "h2", "type": "text"}, - {"name": "description", "selector": ".desc", "type": "text"} - ] -} - -config = CrawlerRunConfig( - js_code="window.scrollTo(0, document.body.scrollHeight);", - wait_for="css:.item:nth-child(10)", # Wait for 10 items - extraction_strategy=JsonCssExtractionStrategy(schema) -) -result = await crawler.arun(url="https://example.com", config=config) - -# Or use LLM to analyze dynamic content -class ContentAnalysis(BaseModel): - topics: List[str] - summary: str - -config = CrawlerRunConfig( - js_code="document.querySelector('.show-more').click();", - wait_for="css:.full-content", - extraction_strategy=LLMExtractionStrategy( - provider="ollama/nemotron", - schema=ContentAnalysis.schema(), - instruction="Analyze the full content" - ) -) -result = await crawler.arun(url="https://example.com", config=config) -``` diff --git a/.local/llm.txt/11_page_interaction.q.md b/.local/llm.txt/11_page_interaction.q.md deleted file mode 100644 index a28e5b11..00000000 --- a/.local/llm.txt/11_page_interaction.q.md +++ /dev/null @@ -1,10 +0,0 @@ -javascript_execution: Execute single or multiple JavaScript commands in webpage | js code, javascript commands, browser execution | CrawlerRunConfig(js_code="window.scrollTo(0, document.body.scrollHeight);") -css_wait: Wait for specific CSS elements to appear on page | css selector, element waiting, dynamic content | CrawlerRunConfig(wait_for="css:.dynamic-content") -js_wait_condition: Define custom JavaScript wait conditions for dynamic content | javascript waiting, conditional wait, custom conditions | CrawlerRunConfig(wait_for="js:() => document.querySelectorAll('.item').length > 10") -infinite_scroll: Handle infinite scroll and load more buttons | pagination, dynamic loading, scroll handling | CrawlerRunConfig(js_code="window.scrollTo(0, document.body.scrollHeight);") -form_interaction: Fill and submit forms using JavaScript | form handling, input filling, form submission | CrawlerRunConfig(js_code="document.querySelector('#search').value = 'search term';") -timing_control: Set page timeouts and delays before content capture | page timing, delays, timeouts | CrawlerRunConfig(page_timeout=60000, delay_before_return_html=2.0) -session_management: Maintain browser session for multiple interactions | session handling, browser state, session cleanup | crawler.crawler_strategy.kill_session(session_id) -cookie_consent: Handle cookie consent popups and 
notifications | cookie handling, popup management | CrawlerRunConfig(js_code="document.querySelector('.cookie-accept')?.click();") -extraction_combination: Combine page interactions with structured data extraction | data extraction, content parsing | JsonCssExtractionStrategy(schema), LLMExtractionStrategy(schema) -dynamic_content_loading: Wait for and verify dynamic content loading | content verification, dynamic loading | wait_for="js:() => document.querySelector('.content').innerText.length > 100" \ No newline at end of file diff --git a/.local/llm.txt/12_prefix_based_input.md b/.local/llm.txt/12_prefix_based_input.md deleted file mode 100644 index f9155cbc..00000000 --- a/.local/llm.txt/12_prefix_based_input.md +++ /dev/null @@ -1,158 +0,0 @@ -# Prefix-Based Input Handling in Crawl4AI - -This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example. - -## Crawling a Web URL - -To crawl a live web page, provide the URL starting with `http://` or `https://`, using a `CrawlerRunConfig` object: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import CrawlerRunConfig - -async def crawl_web(): - config = CrawlerRunConfig(bypass_cache=True) - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", config=config) - if result.success: - print("Markdown Content:") - print(result.markdown) - else: - print(f"Failed to crawl: {result.error_message}") - -asyncio.run(crawl_web()) -``` - -## Crawling a Local HTML File - -To crawl a local HTML file, prefix the file path with `file://`. - -```python -import asyncio -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import CrawlerRunConfig - -async def crawl_local_file(): - local_file_path = "/path/to/apple.html" # Replace with your file path - file_url = f"file://{local_file_path}" - config = CrawlerRunConfig(bypass_cache=True) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url=file_url, config=config) - if result.success: - print("Markdown Content from Local File:") - print(result.markdown) - else: - print(f"Failed to crawl local file: {result.error_message}") - -asyncio.run(crawl_local_file()) -``` - -## Crawling Raw HTML Content - -To crawl raw HTML content, prefix the HTML string with `raw:`. - -```python -import asyncio -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import CrawlerRunConfig - -async def crawl_raw_html(): - raw_html = "
<html><body><h1>Hello, World!</h1></body></html>
" - raw_html_url = f"raw:{raw_html}" - config = CrawlerRunConfig(bypass_cache=True) - - async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url=raw_html_url, config=config) - if result.success: - print("Markdown Content from Raw HTML:") - print(result.markdown) - else: - print(f"Failed to crawl raw HTML: {result.error_message}") - -asyncio.run(crawl_raw_html()) -``` - ---- - -## Complete Example - -Below is a comprehensive script that: - -1. Crawls the Wikipedia page for "Apple." -2. Saves the HTML content to a local file (`apple.html`). -3. Crawls the local HTML file and verifies the markdown length matches the original crawl. -4. Crawls the raw HTML content from the saved file and verifies consistency. - -```python -import os -import sys -import asyncio -from pathlib import Path -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import CrawlerRunConfig - -async def main(): - wikipedia_url = "https://en.wikipedia.org/wiki/apple" - script_dir = Path(__file__).parent - html_file_path = script_dir / "apple.html" - - async with AsyncWebCrawler() as crawler: - # Step 1: Crawl the Web URL - print("\n=== Step 1: Crawling the Wikipedia URL ===") - web_config = CrawlerRunConfig(bypass_cache=True) - result = await crawler.arun(url=wikipedia_url, config=web_config) - - if not result.success: - print(f"Failed to crawl {wikipedia_url}: {result.error_message}") - return - - with open(html_file_path, 'w', encoding='utf-8') as f: - f.write(result.html) - web_crawl_length = len(result.markdown) - print(f"Length of markdown from web crawl: {web_crawl_length}\n") - - # Step 2: Crawl from the Local HTML File - print("=== Step 2: Crawling from the Local HTML File ===") - file_url = f"file://{html_file_path.resolve()}" - file_config = CrawlerRunConfig(bypass_cache=True) - local_result = await crawler.arun(url=file_url, config=file_config) - - if not local_result.success: - print(f"Failed to crawl local file {file_url}: {local_result.error_message}") - return - - local_crawl_length = len(local_result.markdown) - assert web_crawl_length == local_crawl_length, "Markdown length mismatch" - print("✅ Markdown length matches between web and local file crawl.\n") - - # Step 3: Crawl Using Raw HTML Content - print("=== Step 3: Crawling Using Raw HTML Content ===") - with open(html_file_path, 'r', encoding='utf-8') as f: - raw_html_content = f.read() - raw_html_url = f"raw:{raw_html_content}" - raw_config = CrawlerRunConfig(bypass_cache=True) - raw_result = await crawler.arun(url=raw_html_url, config=raw_config) - - if not raw_result.success: - print(f"Failed to crawl raw HTML content: {raw_result.error_message}") - return - - raw_crawl_length = len(raw_result.markdown) - assert web_crawl_length == raw_crawl_length, "Markdown length mismatch" - print("✅ Markdown length matches between web and raw HTML crawl.\n") - - print("All tests passed successfully!") - if html_file_path.exists(): - os.remove(html_file_path) - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -## Conclusion - -With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can seamlessly handle web URLs, local HTML files, and raw HTML content. Use `CrawlerRunConfig` for flexible and consistent configuration in all scenarios. 
\ No newline at end of file diff --git a/.local/llm.txt/12_prefix_based_input.q.md b/.local/llm.txt/12_prefix_based_input.q.md deleted file mode 100644 index 7ff392bd..00000000 --- a/.local/llm.txt/12_prefix_based_input.q.md +++ /dev/null @@ -1,10 +0,0 @@ -url_prefix_handling: Crawl4AI supports different URL prefixes for various input types | input handling, url format, crawling types | url="https://example.com" or "file://path" or "raw:html" -web_crawling: Crawl live web pages using http:// or https:// prefixes with AsyncWebCrawler | web scraping, url crawling, web content | AsyncWebCrawler().arun(url="https://example.com") -local_file_crawling: Access local HTML files using file:// prefix for crawling | local html, file crawling, file access | AsyncWebCrawler().arun(url="file:///path/to/file.html") -raw_html_crawling: Process raw HTML content directly using raw: prefix | html string, raw content, direct html | AsyncWebCrawler().arun(url="raw:content") -crawler_config: Configure crawling behavior using CrawlerRunConfig object | crawler settings, configuration, bypass cache | CrawlerRunConfig(bypass_cache=True) -async_context: AsyncWebCrawler should be used within async context manager | async with, context management, async programming | async with AsyncWebCrawler() as crawler -crawl_result: Crawler returns result object containing success status, markdown and error messages | response handling, crawl output, result parsing | result.success, result.markdown, result.error_message -html_to_markdown: Crawler automatically converts HTML content to markdown format | format conversion, markdown generation, content processing | result.markdown -error_handling: Check crawl success status and handle error messages appropriately | error checking, failure handling, status verification | if result.success: ... else: print(result.error_message) -content_verification: Compare markdown length between different crawling methods for consistency | content validation, length comparison, consistency check | assert web_crawl_length == local_crawl_length \ No newline at end of file diff --git a/.local/llm.txt/13_hooks_auth.md b/.local/llm.txt/13_hooks_auth.md deleted file mode 100644 index 89258550..00000000 --- a/.local/llm.txt/13_hooks_auth.md +++ /dev/null @@ -1,119 +0,0 @@ -# Hooks & Auth for AsyncWebCrawler - -Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This updated documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, and ensures compatibility with `BrowserConfig` and `CrawlerRunConfig`. - -In this example, we'll: - -1. Configure the browser and set up authentication when it's created. -2. Apply custom routing and initial actions when the page context is created. -3. Add custom headers before navigating to the URL. -4. Log the current URL after navigation. -5. Perform actions after JavaScript execution. -6. Log the length of the HTML before returning it. 
- -## Hook Definitions - -```python -import asyncio -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig -from playwright.async_api import Page, Browser, BrowserContext - -def log_routing(route): - # Example: block loading images - if route.request.resource_type == "image": - print(f"[HOOK] Blocking image request: {route.request.url}") - asyncio.create_task(route.abort()) - else: - asyncio.create_task(route.continue_()) - -async def on_browser_created(browser: Browser, **kwargs): - print("[HOOK] on_browser_created") - # Example: Set browser viewport size and log in - context = await browser.new_context(viewport={"width": 1920, "height": 1080}) - page = await context.new_page() - await page.goto("https://example.com/login") - await page.fill("input[name='username']", "testuser") - await page.fill("input[name='password']", "password123") - await page.click("button[type='submit']") - await page.wait_for_selector("#welcome") - await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}]) - await page.close() - await context.close() - -async def on_page_context_created(context: BrowserContext, page: Page, **kwargs): - print("[HOOK] on_page_context_created") - await context.route("**", log_routing) - -async def before_goto(page: Page, context: BrowserContext, **kwargs): - print("[HOOK] before_goto") - await page.set_extra_http_headers({"X-Test-Header": "test"}) - -async def after_goto(page: Page, context: BrowserContext, **kwargs): - print("[HOOK] after_goto") - print(f"Current URL: {page.url}") - -async def on_execution_started(page: Page, context: BrowserContext, **kwargs): - print("[HOOK] on_execution_started") - await page.evaluate("console.log('Custom JS executed')") - -async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs): - print("[HOOK] before_return_html") - print(f"HTML length: {len(html)}") - return page -``` - -## Using the Hooks with AsyncWebCrawler - -```python -async def main(): - print("\n🔗 Using Crawler Hooks: Customize AsyncWebCrawler with hooks!") - - # Configure browser and crawler settings - browser_config = BrowserConfig( - headless=True, - viewport_width=1920, - viewport_height=1080 - ) - - crawler_run_config = CrawlerRunConfig( - js_code="window.scrollTo(0, document.body.scrollHeight);", - wait_for="footer" - ) - - # Initialize crawler - async with AsyncWebCrawler(config=browser_config) as crawler: - crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) - crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) - crawler.crawler_strategy.set_hook("before_goto", before_goto) - crawler.crawler_strategy.set_hook("after_goto", after_goto) - crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started) - crawler.crawler_strategy.set_hook("before_return_html", before_return_html) - - # Run the crawler - result = await crawler.arun(url="https://example.com", config=crawler_run_config) - - print("\n📦 Crawler Hooks Result:") - print(result) - -asyncio.run(main()) -``` - -## Explanation of Hooks - -- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies). -- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL. -- **`before_goto`**: Called before navigating to the URL. 
Use this to add custom headers or perform other pre-navigation actions. -- **`after_goto`**: Called after navigation. Use this to verify content or log the URL. -- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions. -- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content. - -## Additional Customizations - -- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts). -- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL. -- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens. -- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content. - -These hooks provide powerful customization options for tailoring the crawling process to your needs. - diff --git a/.local/llm.txt/13_hooks_auth.q.md b/.local/llm.txt/13_hooks_auth.q.md deleted file mode 100644 index c269b9fc..00000000 --- a/.local/llm.txt/13_hooks_auth.q.md +++ /dev/null @@ -1,12 +0,0 @@ -crawler_hooks: AsyncWebCrawler supports customizable hooks for modifying crawler behavior | hooks, async functions, crawler customization | crawler.crawler_strategy.set_hook() -browser_creation_hook: on_browser_created hook executes when browser is initialized for authentication and setup | browser setup, login, authentication | async def on_browser_created(browser: Browser, **kwargs) -page_context_hook: on_page_context_created hook handles routing and initial page setup | page context, routing, resource blocking | async def on_page_context_created(context: BrowserContext, page: Page, **kwargs) -navigation_pre_hook: before_goto hook allows adding custom headers before URL navigation | headers, pre-navigation, request modification | async def before_goto(page: Page, context: BrowserContext, **kwargs) -navigation_post_hook: after_goto hook executes after URL navigation for verification | post-navigation, URL logging | async def after_goto(page: Page, context: BrowserContext, **kwargs) -js_execution_hook: on_execution_started hook runs after custom JavaScript execution | JavaScript, script execution | async def on_execution_started(page: Page, context: BrowserContext, **kwargs) -html_processing_hook: before_return_html hook processes HTML content before returning | HTML content, preprocessing | async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs) -browser_configuration: BrowserConfig allows setting headless mode and viewport dimensions | browser settings, viewport | BrowserConfig(headless=True, viewport_width=1920, viewport_height=1080) -crawler_configuration: CrawlerRunConfig defines JavaScript execution and wait conditions | crawler settings, JS code, wait conditions | CrawlerRunConfig(js_code="window.scrollTo(0)", wait_for="footer") -resource_management: Route handlers can block or modify specific resource types | resource blocking, request handling | if route.request.resource_type == "image": await route.abort() -authentication_flow: Browser authentication handled through login form interaction and cookie setting | login process, cookies | await page.fill("input[name='username']", "testuser") -hook_registration: Hooks are registered using the crawler strategy's set_hook method | hook setup, strategy | crawler.crawler_strategy.set_hook("hook_name", hook_function) \ No newline at end of 
file diff --git a/.local/llm.txt/14_proxy_security.md b/.local/llm.txt/14_proxy_security.md deleted file mode 100644 index 5d46726e..00000000 --- a/.local/llm.txt/14_proxy_security.md +++ /dev/null @@ -1,131 +0,0 @@ -# Proxy & Security - -Configure proxy settings and enhance security features in Crawl4AI for reliable data extraction. - -## Basic Proxy Setup - -Simple proxy configuration with `BrowserConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig - -# Using proxy URL -browser_config = BrowserConfig(proxy="http://proxy.example.com:8080") -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com") - -# Using SOCKS proxy -browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080") -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## Authenticated Proxy - -Use an authenticated proxy with `BrowserConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig - -proxy_config = { - "server": "http://proxy.example.com:8080", - "username": "user", - "password": "pass" -} - -browser_config = BrowserConfig(proxy_config=proxy_config) -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## Rotating Proxies - -Example using a proxy rotation service and updating `BrowserConfig` dynamically: - -```python -from crawl4ai.async_configs import BrowserConfig - -async def get_next_proxy(): - # Your proxy rotation logic here - return {"server": "http://next.proxy.com:8080"} - -browser_config = BrowserConfig() -async with AsyncWebCrawler(config=browser_config) as crawler: - # Update proxy for each request - for url in urls: - proxy = await get_next_proxy() - browser_config.proxy_config = proxy - result = await crawler.arun(url=url, config=browser_config) -``` - -## Custom Headers - -Add security-related headers via `BrowserConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig - -headers = { - "X-Forwarded-For": "203.0.113.195", - "Accept-Language": "en-US,en;q=0.9", - "Cache-Control": "no-cache", - "Pragma": "no-cache" -} - -browser_config = BrowserConfig(headers=headers) -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com") -``` - -## Combining with Magic Mode - -For maximum protection, combine proxy with Magic Mode via `CrawlerRunConfig` and `BrowserConfig`: - -```python -from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig - -browser_config = BrowserConfig( - proxy="http://proxy.example.com:8080", - headers={"Accept-Language": "en-US"} -) -crawler_config = CrawlerRunConfig(magic=True) # Enable all anti-detection features - -async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url="https://example.com", config=crawler_config) -``` - -## SSL Certificate Verification - -Crawl4AI can retrieve and analyze SSL certificates from HTTPS websites. 
This is useful for: -- Verifying website authenticity -- Detecting potential security issues -- Analyzing certificate chains -- Exporting certificates for further analysis - -Enable SSL certificate retrieval with `CrawlerRunConfig`: - -```python -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig - -config = CrawlerRunConfig(fetch_ssl_certificate=True) -async with AsyncWebCrawler() as crawler: - result = await crawler.arun(url="https://example.com", config=config) - - if result.success and result.ssl_certificate: - cert = result.ssl_certificate - - # Access certificate properties - print(f"Issuer: {cert.issuer.get('CN', '')}") - print(f"Valid until: {cert.valid_until}") - print(f"Fingerprint: {cert.fingerprint}") - - # Export certificate in different formats - cert.to_json("cert.json") # For analysis - cert.to_pem("cert.pem") # For web servers - cert.to_der("cert.der") # For Java applications -``` - -The SSL certificate object provides: -- Direct access to certificate fields (issuer, subject, validity dates) -- Methods to export in common formats (JSON, PEM, DER) -- Certificate chain information and extensions diff --git a/.local/llm.txt/14_proxy_security.q.md b/.local/llm.txt/14_proxy_security.q.md deleted file mode 100644 index 1489d277..00000000 --- a/.local/llm.txt/14_proxy_security.q.md +++ /dev/null @@ -1,8 +0,0 @@ -proxy_setup: Configure basic proxy in Crawl4AI using BrowserConfig with proxy URL | proxy configuration, proxy setup, basic proxy | BrowserConfig(proxy="http://proxy.example.com:8080") -socks_proxy: Use SOCKS proxy protocol for web crawling | SOCKS5, proxy protocol, SOCKS connection | BrowserConfig(proxy="socks5://proxy.example.com:1080") -authenticated_proxy: Set up proxy with username and password authentication | proxy auth, proxy credentials, authenticated connection | BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080", "username": "user", "password": "pass"}) -rotating_proxies: Implement dynamic proxy rotation during crawling | proxy rotation, proxy switching, dynamic proxies | browser_config.proxy_config = await get_next_proxy() -custom_headers: Add security headers to browser configuration for enhanced protection | HTTP headers, request headers, security headers | BrowserConfig(headers={"X-Forwarded-For": "203.0.113.195", "Accept-Language": "en-US,en;q=0.9"}) -magic_mode: Combine proxy settings with Magic Mode for maximum anti-detection | anti-detection, stealth mode, protection features | CrawlerRunConfig(magic=True) with BrowserConfig(proxy="http://proxy.example.com:8080") -crawler_context: Use AsyncWebCrawler with async context manager for proper resource management | async crawler, context manager, crawler setup | async with AsyncWebCrawler(config=browser_config) as crawler -cache_control: Set cache control headers to prevent caching during crawling | caching headers, no-cache, cache prevention | BrowserConfig(headers={"Cache-Control": "no-cache", "Pragma": "no-cache"}) \ No newline at end of file diff --git a/.local/llm.txt/15_screenshot_and_pdf_export.md b/.local/llm.txt/15_screenshot_and_pdf_export.md deleted file mode 100644 index 4dcc3ff1..00000000 --- a/.local/llm.txt/15_screenshot_and_pdf_export.md +++ /dev/null @@ -1,58 +0,0 @@ -# Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI - -When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. 
For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or style differences. - -## **The New Approach:** -We’ve introduced a new feature that effortlessly handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. You also have the option to directly save the PDF for your own usage—no need for multiple passes or complex stitching logic. - -## **Key Benefits:** -- **Reliability:** The PDF export never times out and works regardless of page length. -- **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing. -- **Performance:** Skips manual scrolling and stitching images, reducing complexity and runtime. - -## **Simple Example:** -```python -import os, sys -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode - -# Adjust paths as needed -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) - -async def main(): - async with AsyncWebCrawler() as crawler: - # Request both PDF and screenshot - result = await crawler.arun( - url='https://en.wikipedia.org/wiki/List_of_common_misconceptions', - cache_mode=CacheMode.BYPASS, - pdf=True, - screenshot=True - ) - - if result.success: - # Save screenshot - if result.screenshot: - from base64 import b64decode - with open(os.path.join(__location__, "screenshot.png"), "wb") as f: - f.write(b64decode(result.screenshot)) - - # Save PDF - if result.pdf: - pdf_bytes = b64decode(result.pdf) - with open(os.path.join(__location__, "page.pdf"), "wb") as f: - f.write(pdf_bytes) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## **What Happens Under the Hood:** -- Crawl4AI navigates to the target page. -- If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length. -- If `screenshot=True`, and a PDF is already available, it directly converts the first page of that PDF to an image for you—no repeated loading or scrolling. -- Finally, you get your PDF and/or screenshot ready to use. - -## **Conclusion:** -With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages. 
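The PDF-to-image step described under the hood happens inside Crawl4AI, but the same idea is easy to reproduce on the exported file. A minimal sketch, assuming the optional PyMuPDF package (`pip install pymupdf`) and the `page.pdf` saved by the example above:

```python
import fitz  # PyMuPDF (optional dependency assumed for this sketch)

# Rasterize the first page of the exported PDF, roughly mirroring what
# Crawl4AI does internally when screenshot=True and a PDF is available.
doc = fitz.open("page.pdf")
pix = doc[0].get_pixmap(dpi=150)  # render page 1 at 150 DPI
pix.save("page_preview.png")
doc.close()
```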
\ No newline at end of file diff --git a/.local/llm.txt/15_screenshot_and_pdf_export.q.md b/.local/llm.txt/15_screenshot_and_pdf_export.q.md deleted file mode 100644 index 04e466d7..00000000 --- a/.local/llm.txt/15_screenshot_and_pdf_export.q.md +++ /dev/null @@ -1,9 +0,0 @@ -page_capture: Full-page screenshots and PDFs can be generated for massive webpages using Crawl4AI | webpage capture, full page screenshot, pdf export | AsyncWebCrawler().arun(url=url, pdf=True, screenshot=True) -pdf_approach: Pages are first exported as PDF then converted to high-quality images for better handling of large content | pdf conversion, image export, page rendering | result.pdf, result.screenshot -export_benefits: PDF export method never times out and works with any page length | timeout handling, page size limits, reliability | pdf=True -dual_output: Get both PDF and screenshot in single crawl without reloading | multiple formats, single pass, efficient capture | pdf=True, screenshot=True -result_handling: Screenshot and PDF data are returned as base64 encoded strings | base64 encoding, binary data, file saving | b64decode(result.screenshot), b64decode(result.pdf) -cache_control: Cache mode can be bypassed for fresh page captures | caching, fresh content, bypass cache | cache_mode=CacheMode.BYPASS -async_operation: Crawler operates asynchronously using Python's asyncio framework | async/await, concurrent execution | async with AsyncWebCrawler() as crawler -file_saving: Screenshots and PDFs can be saved directly to local files | file output, save results, local storage | open("screenshot.png", "wb"), open("page.pdf", "wb") -error_handling: Success status can be checked before processing results | error checking, result validation | if result.success: \ No newline at end of file diff --git a/.local/llm.txt/16_storage_state.md b/.local/llm.txt/16_storage_state.md deleted file mode 100644 index 55858c4d..00000000 --- a/.local/llm.txt/16_storage_state.md +++ /dev/null @@ -1,225 +0,0 @@ -# Using `storage_state` to Pre-Load Cookies and LocalStorage - -Crawl4ai’s `AsyncWebCrawler` lets you preserve and reuse session data, including cookies and localStorage, across multiple runs. By providing a `storage_state`, you can start your crawls already “logged in” or with any other necessary session data—no need to repeat the login flow every time. - -## What is `storage_state`? - -`storage_state` can be: - -- A dictionary containing cookies and localStorage data. -- A path to a JSON file that holds this information. - -When you pass `storage_state` to the crawler, it applies these cookies and localStorage entries before loading any pages. This means your crawler effectively starts in a known authenticated or pre-configured state. - -## Example Structure - -Here’s an example storage state: - -```json -{ - "cookies": [ - { - "name": "session", - "value": "abcd1234", - "domain": "example.com", - "path": "/", - "expires": 1675363572.037711, - "httpOnly": false, - "secure": false, - "sameSite": "None" - } - ], - "origins": [ - { - "origin": "https://example.com", - "localStorage": [ - { "name": "token", "value": "my_auth_token" }, - { "name": "refreshToken", "value": "my_refresh_token" } - ] - } - ] -} -``` - -This JSON sets a `session` cookie and two localStorage entries (`token` and `refreshToken`) for `https://example.com`. 
- ---- - -## Passing `storage_state` as a Dictionary - -You can directly provide the data as a dictionary: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler - -async def main(): - storage_dict = { - "cookies": [ - { - "name": "session", - "value": "abcd1234", - "domain": "example.com", - "path": "/", - "expires": 1675363572.037711, - "httpOnly": False, - "secure": False, - "sameSite": "None" - } - ], - "origins": [ - { - "origin": "https://example.com", - "localStorage": [ - {"name": "token", "value": "my_auth_token"}, - {"name": "refreshToken", "value": "my_refresh_token"} - ] - } - ] - } - - async with AsyncWebCrawler( - headless=True, - storage_state=storage_dict - ) as crawler: - result = await crawler.arun(url='https://example.com/protected') - if result.success: - print("Crawl succeeded with pre-loaded session data!") - print("Page HTML length:", len(result.html)) - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -## Passing `storage_state` as a File - -If you prefer a file-based approach, save the JSON above to `mystate.json` and reference it: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler - -async def main(): - async with AsyncWebCrawler( - headless=True, - storage_state="mystate.json" # Uses a JSON file instead of a dictionary - ) as crawler: - result = await crawler.arun(url='https://example.com/protected') - if result.success: - print("Crawl succeeded with pre-loaded session data!") - print("Page HTML length:", len(result.html)) - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -## Using `storage_state` to Avoid Repeated Logins (Sign In Once, Use Later) - -A common scenario is when you need to log in to a site (entering username/password, etc.) to access protected pages. Doing so every crawl is cumbersome. Instead, you can: - -1. Perform the login once in a hook. -2. After login completes, export the resulting `storage_state` to a file. -3. On subsequent runs, provide that `storage_state` to skip the login step. 
- -**Step-by-Step Example:** - -**First Run (Perform Login and Save State):** - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator - -async def on_browser_created_hook(browser): - # Access the default context and create a page - context = browser.contexts[0] - page = await context.new_page() - - # Navigate to the login page - await page.goto("https://example.com/login", wait_until="domcontentloaded") - - # Fill in credentials and submit - await page.fill("input[name='username']", "myuser") - await page.fill("input[name='password']", "mypassword") - await page.click("button[type='submit']") - await page.wait_for_load_state("networkidle") - - # Now the site sets tokens in localStorage and cookies - # Export this state to a file so we can reuse it - await context.storage_state(path="my_storage_state.json") - await page.close() - -async def main(): - # First run: perform login and export the storage_state - async with AsyncWebCrawler( - headless=True, - verbose=True, - hooks={"on_browser_created": on_browser_created_hook}, - use_persistent_context=True, - user_data_dir="./my_user_data" - ) as crawler: - - # After on_browser_created_hook runs, we have storage_state saved to my_storage_state.json - result = await crawler.arun( - url='https://example.com/protected-page', - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("First run result success:", result.success) - if result.success: - print("Protected page HTML length:", len(result.html)) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -**Second Run (Reuse Saved State, No Login Needed):** - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator - -async def main(): - # Second run: no need to hook on_browser_created this time. - # Just provide the previously saved storage state. - async with AsyncWebCrawler( - headless=True, - verbose=True, - use_persistent_context=True, - user_data_dir="./my_user_data", - storage_state="my_storage_state.json" # Reuse previously exported state - ) as crawler: - - # Now the crawler starts already logged in - result = await crawler.arun( - url='https://example.com/protected-page', - cache_mode=CacheMode.BYPASS, - markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}), - ) - print("Second run result success:", result.success) - if result.success: - print("Protected page HTML length:", len(result.html)) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -**What’s Happening Here?** - -- During the first run, the `on_browser_created_hook` logs into the site. -- After logging in, the crawler exports the current session (cookies, localStorage, etc.) to `my_storage_state.json`. -- On subsequent runs, passing `storage_state="my_storage_state.json"` starts the browser context with these tokens already in place, skipping the login steps. - -**Sign Out Scenario:** -If the website allows you to sign out by clearing tokens or by navigating to a sign-out URL, you can also run a script that uses `on_browser_created_hook` or `arun` to simulate signing out, then export the resulting `storage_state` again. That would give you a baseline “logged out” state to start fresh from next time. 
- ---- - -## Conclusion - -By using `storage_state`, you can skip repetitive actions, like logging in, and jump straight into crawling protected content. Whether you provide a file path or a dictionary, this powerful feature helps maintain state between crawls, simplifying your data extraction pipelines. \ No newline at end of file diff --git a/.local/llm.txt/17_crawl_config.md b/.local/llm.txt/17_crawl_config.md deleted file mode 100644 index 928ae1e2..00000000 --- a/.local/llm.txt/17_crawl_config.md +++ /dev/null @@ -1,85 +0,0 @@ -# CrawlerRunConfig Parameters Documentation - -## Content Processing Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `word_count_threshold` | int | 200 | Minimum word count threshold before processing content | -| `extraction_strategy` | ExtractionStrategy | None | Strategy to extract structured data from crawled pages. When None, uses NoExtractionStrategy | -| `chunking_strategy` | ChunkingStrategy | RegexChunking() | Strategy to chunk content before extraction | -| `markdown_generator` | MarkdownGenerationStrategy | None | Strategy for generating markdown from extracted content | -| `content_filter` | RelevantContentFilter | None | Optional filter to prune irrelevant content | -| `only_text` | bool | False | If True, attempt to extract text-only content where applicable | -| `css_selector` | str | None | CSS selector to extract a specific portion of the page | -| `excluded_tags` | list[str] | [] | List of HTML tags to exclude from processing | -| `keep_data_attributes` | bool | False | If True, retain `data-*` attributes while removing unwanted attributes | -| `remove_forms` | bool | False | If True, remove all `
form` elements from the HTML | -| `prettiify` | bool | False | If True, apply `fast_format_html` to produce prettified HTML output | - -## Caching Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `cache_mode` | CacheMode | None | Defines how caching is handled. Defaults to CacheMode.ENABLED internally | -| `session_id` | str | None | Optional session ID to persist browser context and page instance | -| `bypass_cache` | bool | False | Legacy parameter, if True acts like CacheMode.BYPASS | -| `disable_cache` | bool | False | Legacy parameter, if True acts like CacheMode.DISABLED | -| `no_cache_read` | bool | False | Legacy parameter, if True acts like CacheMode.WRITE_ONLY | -| `no_cache_write` | bool | False | Legacy parameter, if True acts like CacheMode.READ_ONLY | - -## Page Navigation and Timing Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `wait_until` | str | "domcontentloaded" | The condition to wait for when navigating | -| `page_timeout` | int | 60000 | Timeout in milliseconds for page operations like navigation | -| `wait_for` | str | None | CSS selector or JS condition to wait for before extracting content | -| `wait_for_images` | bool | True | If True, wait for images to load before extracting content | -| `delay_before_return_html` | float | 0.1 | Delay in seconds before retrieving final HTML | -| `mean_delay` | float | 0.1 | Mean base delay between requests when calling arun_many | -| `max_range` | float | 0.3 | Max random additional delay range for requests in arun_many | -| `semaphore_count` | int | 5 | Number of concurrent operations allowed | - -## Page Interaction Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `js_code` | str or list[str] | None | JavaScript code/snippets to run on the page | -| `js_only` | bool | False | If True, indicates subsequent calls are JS-driven updates | -| `ignore_body_visibility` | bool | True | If True, ignore whether the body is visible before proceeding | -| `scan_full_page` | bool | False | If True, scroll through the entire page to load all content | -| `scroll_delay` | float | 0.2 | Delay in seconds between scroll steps if scan_full_page is True | -| `process_iframes` | bool | False | If True, attempts to process and inline iframe content | -| `remove_overlay_elements` | bool | False | If True, remove overlays/popups before extracting HTML | -| `simulate_user` | bool | False | If True, simulate user interactions for anti-bot measures | -| `override_navigator` | bool | False | If True, overrides navigator properties for more human-like behavior | -| `magic` | bool | False | If True, attempts automatic handling of overlays/popups | -| `adjust_viewport_to_content` | bool | False | If True, adjust viewport according to page content dimensions | - -## Media Handling Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `screenshot` | bool | False | Whether to take a screenshot after crawling | -| `screenshot_wait_for` | float | None | Additional wait time before taking a screenshot | -| `screenshot_height_threshold` | int | 20000 | Threshold for page height to decide screenshot strategy | -| `pdf` | bool | False | Whether to generate a PDF of the page | -| `image_description_min_word_threshold` | int | 50 | Minimum words for image description extraction | -| `image_score_threshold` | int | 3 | Minimum score
threshold for processing an image | -| `exclude_external_images` | bool | False | If True, exclude all external images from processing | - -## Link and Domain Handling Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `exclude_social_media_domains` | list[str] | SOCIAL_MEDIA_DOMAINS | List of domains to exclude for social media links | -| `exclude_external_links` | bool | False | If True, exclude all external links from the results | -| `exclude_social_media_links` | bool | False | If True, exclude links pointing to social media domains | -| `exclude_domains` | list[str] | [] | List of specific domains to exclude from results | - -## Debugging and Logging Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `verbose` | bool | True | Enable verbose logging | -| `log_console` | bool | False | If True, log console messages from the page | \ No newline at end of file diff --git a/.local/llm.txt/1_introduction.xs.md b/.local/llm.txt/1_introduction.xs.md deleted file mode 100644 index 46cd5d44..00000000 --- a/.local/llm.txt/1_introduction.xs.md +++ /dev/null @@ -1,112 +0,0 @@ -# Crawl4AI LLM Reference - -> Minimal, code-focused reference for LLM-based retrieval and answer generation. - -Intended usage: A language model trained on this document can provide quick answers to developers integrating Crawl4AI. - -## Installation - -- Basic: -```bash -pip install crawl4ai -crawl4ai-setup -``` - -- If necessary: -```bash -playwright install chromium -``` - -## Basic Usage - -- Asynchronous crawl: -```python -import asyncio -from crawl4ai import AsyncWebCrawler - -async def main(): - async with AsyncWebCrawler(verbose=True) as c: - r = await c.arun(url="https://example.com") - print(r.markdown) - -asyncio.run(main()) -``` - -## Concurrent Crawling - -- Multiple URLs: -```python -urls = ["https://example.com/page1", "https://example.com/page2"] -async with AsyncWebCrawler() as c: - results = await asyncio.gather(*[c.arun(url=u) for u in urls]) -``` - -## Configuration - -- CacheMode: -```python -from crawl4ai import CacheMode -r = await c.arun(url="...", cache_mode=CacheMode.ENABLED) -``` - -- Proxies: -```python -async with AsyncWebCrawler(proxies={"http": "http://user:pass@proxy:port"}) as c: - r = await c.arun("https://example.com") -``` - -- Headers & Viewport: -```python -async with AsyncWebCrawler(headers={"User-Agent": "MyUA"}, viewport={"width":1024,"height":768}) as c: - r = await c.arun("https://example.com") -``` - -## JavaScript Injection - -- Custom JS: -```python -js_code = [""" -(async () => { - const btn = document.querySelector('#load-more'); - if (btn) btn.click(); - await new Promise(r => setTimeout(r, 1000)); -})(); -"""] - -r = await c.arun(url="...", js_code=js_code) -``` - -## Extraction Strategies - -- JSON CSS Extraction: -```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy - -schema = {...} -r = await c.arun(url="...", extraction_strategy=JsonCssExtractionStrategy(schema)) -``` - -- LLM Extraction: -```python -from crawl4ai.extraction_strategy import LLMExtractionStrategy - -r = await c.arun(url="...", - extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token="YOUR_API_KEY", - schema={...}, - extraction_type="schema" - ) -) -``` - -## Common Issues - -- Playwright errors: `playwright install chromium` -- Empty output: Increase wait or use `js_code`. 
-- SSL issues: Check certificates or use `verify_ssl=False` (not recommended for production). - -## Additional Links - -- [GitHub Repository](https://github.com/unclecode/crawl4ai) -- [Documentation](https://crawl4ai.com/mkdocs/) \ No newline at end of file diff --git a/.local/llm.txt/2_configuration.md b/.local/llm.txt/2_configuration.md deleted file mode 100644 index 2ec5cb8f..00000000 --- a/.local/llm.txt/2_configuration.md +++ /dev/null @@ -1,390 +0,0 @@ -# Core Configurations - -## BrowserConfig -`BrowserConfig` centralizes all parameters required to set up and manage a browser instance and its context. This configuration ensures consistent and documented browser behavior for the crawler. Below is a detailed explanation of each parameter and its optimal use cases. - -### Parameters and Use Cases - -#### `browser_type` -- **Description**: Specifies the type of browser to launch. - - Supported values: `"chromium"`, `"firefox"`, `"webkit"` - - Default: `"chromium"` -- **Use Case**: - - Use `"chromium"` for general-purpose crawling with modern web standards. - - Use `"firefox"` when testing against Firefox-specific behavior. - - Use `"webkit"` for testing Safari-like environments. - -#### `headless` -- **Description**: Determines whether the browser runs in headless mode (no GUI). - - Default: `True` -- **Use Case**: - - Enable for faster, automated operations without UI overhead. - - Disable (`False`) when debugging or inspecting browser behavior visually. - -#### `use_managed_browser` -- **Description**: Enables advanced manipulation via a managed browser approach. - - Default: `False` -- **Use Case**: - - Use when fine-grained control is needed over browser sessions, such as debugging network requests or reusing sessions. - -#### `debugging_port` -- **Description**: Port for remote debugging. - - Default: 9222 -- **Use Case**: - - Use for debugging browser sessions with DevTools or external tools. - -#### `use_persistent_context` -- **Description**: Uses a persistent browser context (e.g., saved profiles). - - Automatically enables `use_managed_browser`. - - Default: `False` -- **Use Case**: - - Persistent login sessions for authenticated crawling. - - Retaining cookies or local storage across multiple runs. - -#### `user_data_dir` -- **Description**: Path to a directory for storing persistent browser data. - - Default: `None` -- **Use Case**: - - Specify a directory to save browser profiles for multi-run crawls or debugging. - -#### `chrome_channel` -- **Description**: Specifies the Chrome channel to launch (e.g., `"chrome"`, `"msedge"`). - - Applies only when `browser_type` is `"chromium"`. - - Default: `"chrome"` -- **Use Case**: - - Use `"msedge"` for compatibility testing with Edge browsers. - -#### `proxy` and `proxy_config` -- **Description**: - - `proxy`: Proxy server URL for the browser. - - `proxy_config`: Detailed proxy configuration. - - Default: `None` -- **Use Case**: - - Set `proxy` for single-proxy setups. - - Use `proxy_config` for advanced configurations, such as authenticated proxies or regional routing. - -#### `viewport_width` and `viewport_height` -- **Description**: Sets the default browser viewport dimensions. - - Default: `1080` (width), `600` (height) -- **Use Case**: - - Adjust for crawling responsive layouts or specific device emulations. - -#### `accept_downloads` and `downloads_path` -- **Description**: - - `accept_downloads`: Allows file downloads. - - `downloads_path`: Directory for storing downloads. 
- - Default: `False`, `None` -- **Use Case**: - - Use when downloading and analyzing files like PDFs or spreadsheets. - -#### `storage_state` -- **Description**: Specifies cookies and local storage state. - - Default: `None` -- **Use Case**: - - Provide state data for authenticated or preconfigured sessions. - -#### `ignore_https_errors` -- **Description**: Ignores HTTPS certificate errors. - - Default: `True` -- **Use Case**: - - Enable for crawling sites with invalid certificates (testing environments). - -#### `java_script_enabled` -- **Description**: Toggles JavaScript execution in pages. - - Default: `True` -- **Use Case**: - - Disable for simpler, faster crawls where JavaScript is unnecessary. - -#### `cookies` -- **Description**: List of cookies to add to the browser context. - - Default: `[]` -- **Use Case**: - - Use for authenticated or preconfigured crawling scenarios. - -#### `headers` -- **Description**: Extra HTTP headers applied to all requests. - - Default: `{}` -- **Use Case**: - - Customize headers for API-like crawling or bypassing bot detections. - -#### `user_agent` and `user_agent_mode` -- **Description**: - - `user_agent`: Custom User-Agent string. - - `user_agent_mode`: Mode for generating User-Agent (e.g., `"random"`). - - Default: Standard Chromium-based User-Agent. -- **Use Case**: - - Set static User-Agent for consistent identification. - - Use `"random"` mode to reduce bot detection likelihood. - -#### `text_mode` -- **Description**: Disables images and other rich content for faster load times. - - Default: `False` -- **Use Case**: - - Enable for text-only extraction tasks where speed is prioritized. - -#### `light_mode` -- **Description**: Disables background features for performance gains. - - Default: `False` -- **Use Case**: - - Enable for high-performance crawls on resource-constrained environments. - -#### `extra_args` -- **Description**: Additional command-line arguments for browser execution. - - Default: `[]` -- **Use Case**: - - Use for advanced browser configurations like WebRTC or GPU tuning. - -#### `verbose` -- **Description**: Enable verbose logging of browser operations. - - Default: `True` -- **Use Case**: - - Enable for detailed logging during development and debugging. - - Disable in production for better performance. - -#### `sleep_on_close` -- **Description**: Adds a delay before closing the browser. - - Default: `False` -- **Use Case**: - - Enable when you need to ensure all browser operations are complete before closing. - -## CrawlerRunConfig -The `CrawlerRunConfig` class centralizes parameters for controlling crawl operations. This configuration covers content extraction, page interactions, caching, and runtime behaviors. Below is an exhaustive breakdown of parameters and their best-use scenarios. - -### Parameters and Use Cases - -#### Content Processing Parameters - -##### `word_count_threshold` -- **Description**: Minimum word count threshold for processing content. - - Default: `200` -- **Use Case**: - - Set a higher threshold for content-heavy pages to skip lightweight or irrelevant content. - -##### `extraction_strategy` -- **Description**: Strategy for extracting structured data from crawled pages. - - Default: `None` (uses `NoExtractionStrategy` by default). -- **Use Case**: - - Use for schema-driven extraction when working with well-defined data models like JSON. - -##### `chunking_strategy` -- **Description**: Strategy to chunk content before extraction. - - Default: `RegexChunking()`. 
-- **Use Case**: - - Use NLP-based chunking for semantic extractions or regex for predictable text blocks. - -##### `markdown_generator` -- **Description**: Strategy for generating Markdown output. - - Default: `None`. -- **Use Case**: - - Use custom Markdown strategies for AI-ready outputs like RAG pipelines. - -##### `content_filter` -- **Description**: Optional filter to prune irrelevant content. - - Default: `None`. -- **Use Case**: - - Use relevance-based filters for focused crawls, e.g., keyword-specific searches. - -##### `only_text` -- **Description**: Extracts text-only content where applicable. - - Default: `False`. -- **Use Case**: - - Enable for extracting clean text without HTML tags or rich content. - -##### `css_selector` -- **Description**: CSS selector to extract a specific portion of the page. - - Default: `None`. -- **Use Case**: - - Use when targeting specific page elements, like articles or headlines. - -##### `excluded_tags` -- **Description**: List of HTML tags to exclude from processing. - - Default: `None`. -- **Use Case**: - - Remove elements like `