diff --git a/.gitignore b/.gitignore
index 012e78cb..d485815c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -206,7 +206,6 @@ pypi_build.sh
git_issues.py
git_issues.md
-.local/
.next/
.tests/
.issues/
diff --git a/.local/.chainlit/config.toml b/.local/.chainlit/config.toml
new file mode 100644
index 00000000..810b06f3
--- /dev/null
+++ b/.local/.chainlit/config.toml
@@ -0,0 +1,121 @@
+[project]
+# Whether to enable telemetry (default: true). No personal data is collected.
+enable_telemetry = true
+
+
+# List of environment variables to be provided by each user to use the app.
+user_env = []
+
+# Duration (in seconds) during which the session is saved when the connection is lost
+session_timeout = 3600
+
+# Enable third parties caching (e.g LangChain cache)
+cache = false
+
+# Authorized origins
+allow_origins = ["*"]
+
+# Follow symlink for asset mount (see https://github.com/Chainlit/chainlit/issues/317)
+# follow_symlink = false
+
+[features]
+# Show the prompt playground
+prompt_playground = true
+
+# Process and display HTML in messages. This can be a security risk (see https://stackoverflow.com/questions/19603097/why-is-it-dangerous-to-render-user-generated-html-or-javascript)
+unsafe_allow_html = false
+
+# Process and display mathematical expressions. This can clash with "$" characters in messages.
+latex = false
+
+# Automatically tag threads with the current chat profile (if a chat profile is used)
+auto_tag_thread = true
+
+# Authorize users to spontaneously upload files with messages
+[features.spontaneous_file_upload]
+ enabled = true
+ accept = ["*/*"]
+ max_files = 20
+ max_size_mb = 500
+
+[features.audio]
+ # Threshold for audio recording
+ min_decibels = -45
+ # Delay for the user to start speaking in MS
+ initial_silence_timeout = 3000
+ # Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop.
+ silence_timeout = 1500
+ # Above this duration (MS), the recording will forcefully stop.
+ max_duration = 15000
+ # Duration of the audio chunks in MS
+ chunk_duration = 1000
+ # Sample rate of the audio
+ sample_rate = 44100
+
+[UI]
+# Name of the app and chatbot.
+name = "Chatbot"
+
+# Show the readme while the thread is empty.
+show_readme_as_default = true
+
+# Description of the app and chatbot. This is used for HTML tags.
+# description = ""
+
+# Large size content are by default collapsed for a cleaner ui
+default_collapse_content = true
+
+# The default value for the expand messages settings.
+default_expand_messages = false
+
+# Hide the chain of thought details from the user in the UI.
+hide_cot = false
+
+# Link to your github repo. This will add a github button in the UI's header.
+# github = ""
+
+# Specify a CSS file that can be used to customize the user interface.
+# The CSS file can be served from the public directory or via an external link.
+# custom_css = "/public/test.css"
+
+# Specify a Javascript file that can be used to customize the user interface.
+# The Javascript file can be served from the public directory.
+# custom_js = "/public/test.js"
+
+# Specify a custom font url.
+# custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"
+
+# Specify a custom meta image url.
+# custom_meta_image_url = "https://chainlit-cloud.s3.eu-west-3.amazonaws.com/logo/chainlit_banner.png"
+
+# Specify a custom build directory for the frontend.
+# This can be used to customize the frontend code.
+# Be careful: If this is a relative path, it should not start with a slash.
+# custom_build = "./public/build"
+
+[UI.theme]
+ #layout = "wide"
+ #font_family = "Inter, sans-serif"
+# Override default MUI light theme. (Check theme.ts)
+[UI.theme.light]
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
+
+ [UI.theme.light.primary]
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
+
+# Override default MUI dark theme. (Check theme.ts)
+[UI.theme.dark]
+ #background = "#FAFAFA"
+ #paper = "#FFFFFF"
+
+ [UI.theme.dark.primary]
+ #main = "#F80061"
+ #dark = "#980039"
+ #light = "#FFE7EB"
+
+
+[meta]
+generated_by = "1.1.202"
diff --git a/.local/.chainlit/translations/en-US.json b/.local/.chainlit/translations/en-US.json
new file mode 100644
index 00000000..0bca7207
--- /dev/null
+++ b/.local/.chainlit/translations/en-US.json
@@ -0,0 +1,231 @@
+{
+ "components": {
+ "atoms": {
+ "buttons": {
+ "userButton": {
+ "menu": {
+ "settings": "Settings",
+ "settingsKey": "S",
+ "APIKeys": "API Keys",
+ "logout": "Logout"
+ }
+ }
+ }
+ },
+ "molecules": {
+ "newChatButton": {
+ "newChat": "New Chat"
+ },
+ "tasklist": {
+ "TaskList": {
+ "title": "\ud83d\uddd2\ufe0f Task List",
+ "loading": "Loading...",
+                    "error": "An error occurred"
+ }
+ },
+ "attachments": {
+ "cancelUpload": "Cancel upload",
+ "removeAttachment": "Remove attachment"
+ },
+ "newChatDialog": {
+ "createNewChat": "Create new chat?",
+ "clearChat": "This will clear the current messages and start a new chat.",
+ "cancel": "Cancel",
+ "confirm": "Confirm"
+ },
+ "settingsModal": {
+ "settings": "Settings",
+ "expandMessages": "Expand Messages",
+ "hideChainOfThought": "Hide Chain of Thought",
+ "darkMode": "Dark Mode"
+ },
+ "detailsButton": {
+ "using": "Using",
+ "running": "Running",
+ "took_one": "Took {{count}} step",
+ "took_other": "Took {{count}} steps"
+ },
+ "auth": {
+ "authLogin": {
+ "title": "Login to access the app.",
+ "form": {
+ "email": "Email address",
+ "password": "Password",
+ "noAccount": "Don't have an account?",
+ "alreadyHaveAccount": "Already have an account?",
+ "signup": "Sign Up",
+ "signin": "Sign In",
+ "or": "OR",
+ "continue": "Continue",
+ "forgotPassword": "Forgot password?",
+ "passwordMustContain": "Your password must contain:",
+ "emailRequired": "email is a required field",
+ "passwordRequired": "password is a required field"
+ },
+ "error": {
+ "default": "Unable to sign in.",
+ "signin": "Try signing in with a different account.",
+ "oauthsignin": "Try signing in with a different account.",
+ "redirect_uri_mismatch": "The redirect URI is not matching the oauth app configuration.",
+ "oauthcallbackerror": "Try signing in with a different account.",
+ "oauthcreateaccount": "Try signing in with a different account.",
+ "emailcreateaccount": "Try signing in with a different account.",
+ "callback": "Try signing in with a different account.",
+ "oauthaccountnotlinked": "To confirm your identity, sign in with the same account you used originally.",
+ "emailsignin": "The e-mail could not be sent.",
+ "emailverify": "Please verify your email, a new email has been sent.",
+ "credentialssignin": "Sign in failed. Check the details you provided are correct.",
+ "sessionrequired": "Please sign in to access this page."
+ }
+ },
+ "authVerifyEmail": {
+ "almostThere": "You're almost there! We've sent an email to ",
+ "verifyEmailLink": "Please click on the link in that email to complete your signup.",
+ "didNotReceive": "Can't find the email?",
+ "resendEmail": "Resend email",
+ "goBack": "Go Back",
+ "emailSent": "Email sent successfully.",
+ "verifyEmail": "Verify your email address"
+ },
+ "providerButton": {
+ "continue": "Continue with {{provider}}",
+ "signup": "Sign up with {{provider}}"
+ },
+ "authResetPassword": {
+ "newPasswordRequired": "New password is a required field",
+ "passwordsMustMatch": "Passwords must match",
+ "confirmPasswordRequired": "Confirm password is a required field",
+ "newPassword": "New password",
+ "confirmPassword": "Confirm password",
+ "resetPassword": "Reset Password"
+ },
+ "authForgotPassword": {
+ "email": "Email address",
+ "emailRequired": "email is a required field",
+ "emailSent": "Please check the email address {{email}} for instructions to reset your password.",
+ "enterEmail": "Enter your email address and we will send you instructions to reset your password.",
+ "resendEmail": "Resend email",
+ "continue": "Continue",
+ "goBack": "Go Back"
+ }
+ }
+ },
+ "organisms": {
+ "chat": {
+ "history": {
+ "index": {
+ "showHistory": "Show history",
+ "lastInputs": "Last Inputs",
+ "noInputs": "Such empty...",
+ "loading": "Loading..."
+ }
+ },
+ "inputBox": {
+ "input": {
+ "placeholder": "Type your message here..."
+ },
+ "speechButton": {
+ "start": "Start recording",
+ "stop": "Stop recording"
+ },
+ "SubmitButton": {
+ "sendMessage": "Send message",
+ "stopTask": "Stop Task"
+ },
+ "UploadButton": {
+ "attachFiles": "Attach files"
+ },
+ "waterMark": {
+ "text": "Built with"
+ }
+ },
+ "Messages": {
+ "index": {
+ "running": "Running",
+ "executedSuccessfully": "executed successfully",
+ "failed": "failed",
+ "feedbackUpdated": "Feedback updated",
+ "updating": "Updating"
+ }
+ },
+ "dropScreen": {
+ "dropYourFilesHere": "Drop your files here"
+ },
+ "index": {
+ "failedToUpload": "Failed to upload",
+ "cancelledUploadOf": "Cancelled upload of",
+ "couldNotReachServer": "Could not reach the server",
+ "continuingChat": "Continuing previous chat"
+ },
+ "settings": {
+ "settingsPanel": "Settings panel",
+ "reset": "Reset",
+ "cancel": "Cancel",
+ "confirm": "Confirm"
+ }
+ },
+ "threadHistory": {
+ "sidebar": {
+ "filters": {
+ "FeedbackSelect": {
+ "feedbackAll": "Feedback: All",
+ "feedbackPositive": "Feedback: Positive",
+ "feedbackNegative": "Feedback: Negative"
+ },
+ "SearchBar": {
+ "search": "Search"
+ }
+ },
+ "DeleteThreadButton": {
+                    "confirmMessage": "This will delete the thread as well as its messages and elements.",
+ "cancel": "Cancel",
+ "confirm": "Confirm",
+ "deletingChat": "Deleting chat",
+ "chatDeleted": "Chat deleted"
+ },
+ "index": {
+ "pastChats": "Past Chats"
+ },
+ "ThreadList": {
+ "empty": "Empty...",
+ "today": "Today",
+ "yesterday": "Yesterday",
+ "previous7days": "Previous 7 days",
+ "previous30days": "Previous 30 days"
+ },
+ "TriggerButton": {
+ "closeSidebar": "Close sidebar",
+ "openSidebar": "Open sidebar"
+ }
+ },
+ "Thread": {
+ "backToChat": "Go back to chat",
+ "chatCreatedOn": "This chat was created on"
+ }
+ },
+ "header": {
+ "chat": "Chat",
+ "readme": "Readme"
+ }
+ }
+ },
+ "hooks": {
+ "useLLMProviders": {
+ "failedToFetchProviders": "Failed to fetch providers:"
+ }
+ },
+ "pages": {
+ "Design": {},
+ "Env": {
+ "savedSuccessfully": "Saved successfully",
+ "requiredApiKeys": "Required API Keys",
+ "requiredApiKeysInfo": "To use this app, the following API keys are required. The keys are stored on your device's local storage."
+ },
+ "Page": {
+ "notPartOfProject": "You are not part of this project."
+ },
+ "ResumeButton": {
+ "resumeChat": "Resume Chat"
+ }
+ }
+}
\ No newline at end of file
diff --git a/.local/126/chromedriver b/.local/126/chromedriver
new file mode 100755
index 00000000..6f7de54a
Binary files /dev/null and b/.local/126/chromedriver differ
diff --git a/.local/llm.txt/10_file_download.md b/.local/llm.txt/10_file_download.md
new file mode 100644
index 00000000..eac0f5cb
--- /dev/null
+++ b/.local/llm.txt/10_file_download.md
@@ -0,0 +1,129 @@
+# Download Handling in Crawl4AI
+
+This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files.
+
+## Enabling Downloads
+
+To enable downloads, set the `accept_downloads` parameter in the `BrowserConfig` object and pass it to the crawler.
+
+```python
+from crawl4ai.async_configs import BrowserConfig, AsyncWebCrawler
+
+async def main():
+ config = BrowserConfig(accept_downloads=True) # Enable downloads globally
+ async with AsyncWebCrawler(config=config) as crawler:
+ # ... your crawling logic ...
+
+asyncio.run(main())
+```
+
+Or, enable it for a specific crawl by using `CrawlerRunConfig`:
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def main():
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(accept_downloads=True)
+ result = await crawler.arun(url="https://example.com", config=config)
+ # ...
+```
+
+## Specifying Download Location
+
+Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory.
+
+```python
+from crawl4ai.async_configs import BrowserConfig
+import os
+
+downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path
+os.makedirs(downloads_path, exist_ok=True)
+
+config = BrowserConfig(accept_downloads=True, downloads_path=downloads_path)
+
+async def main():
+ async with AsyncWebCrawler(config=config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+ # ...
+```
+
+## Triggering Downloads
+
+Downloads are typically triggered by user interactions on a web page, such as clicking a download button. Use `js_code` in `CrawlerRunConfig` to simulate these actions and `wait_for` to allow sufficient time for downloads to start.
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+
+config = CrawlerRunConfig(
+ js_code="""
+ const downloadLink = document.querySelector('a[href$=".exe"]');
+ if (downloadLink) {
+ downloadLink.click();
+ }
+ """,
+ wait_for=5 # Wait 5 seconds for the download to start
+)
+
+result = await crawler.arun(url="https://www.python.org/downloads/", config=config)
+```
+
+## Accessing Downloaded Files
+
+The `downloaded_files` attribute of the `CrawlResult` object contains paths to downloaded files.
+
+```python
+if result.downloaded_files:
+ print("Downloaded files:")
+ for file_path in result.downloaded_files:
+ print(f"- {file_path}")
+ file_size = os.path.getsize(file_path)
+ print(f"- File size: {file_size} bytes")
+else:
+ print("No files downloaded.")
+```
+
+## Example: Downloading Multiple Files
+
+```python
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+import os
+from pathlib import Path
+
+async def download_multiple_files(url: str, download_path: str):
+ config = BrowserConfig(accept_downloads=True, downloads_path=download_path)
+ async with AsyncWebCrawler(config=config) as crawler:
+ run_config = CrawlerRunConfig(
+ js_code="""
+ const downloadLinks = document.querySelectorAll('a[download]');
+ for (const link of downloadLinks) {
+ link.click();
+ await new Promise(r => setTimeout(r, 2000)); // Delay between clicks
+ }
+ """,
+ wait_for=10 # Wait for all downloads to start
+ )
+ result = await crawler.arun(url=url, config=run_config)
+
+ if result.downloaded_files:
+ print("Downloaded files:")
+ for file in result.downloaded_files:
+ print(f"- {file}")
+ else:
+ print("No files downloaded.")
+
+# Usage
+download_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
+os.makedirs(download_path, exist_ok=True)
+
+asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path))
+```
+
+## Important Considerations
+
+- **Browser Context:** Downloads are managed within the browser context. Ensure `js_code` correctly targets the download triggers on the webpage.
+- **Timing:** Use `wait_for` in `CrawlerRunConfig` to manage download timing.
+- **Error Handling:** Handle errors to manage failed downloads or incorrect paths gracefully.
+- **Security:** Scan downloaded files for potential security threats before use.
+
+This guide uses `BrowserConfig` and `CrawlerRunConfig` for all download-related configurations, consistent with the `Crawl4AI` codebase.
\ No newline at end of file
diff --git a/.local/llm.txt/10_file_download.q.md b/.local/llm.txt/10_file_download.q.md
new file mode 100644
index 00000000..72be37a1
--- /dev/null
+++ b/.local/llm.txt/10_file_download.q.md
@@ -0,0 +1,63 @@
+### Hypothetical Questions
+
+1. **Enabling Downloads**
+ - *"How do I configure Crawl4AI to allow file downloads during a crawl?"*
+ - *"Where in my code should I set `accept_downloads=True` to enable downloads?"*
+
+2. **Specifying the Download Location**
+ - *"How can I choose a custom directory for storing downloaded files?"*
+ - *"What is the default download directory if I don’t specify one?"*
+
+3. **Triggering Downloads from Pages**
+ - *"How do I simulate a click on a download link or button to initiate file downloads?"*
+ - *"Can I use JavaScript injection (`js_code`) to trigger downloads from the webpage elements?"*
+ - *"What does `wait_for` do, and how do I use it to ensure the download starts before proceeding?"*
+
+4. **Accessing Downloaded Files**
+ - *"Where can I find the paths to the files that I’ve downloaded?"*
+ - *"How do I check if any files were downloaded after my crawl completes?"*
+
+5. **Multiple Downloads**
+ - *"How do I handle scenarios where multiple files need to be downloaded sequentially?"*
+ - *"Can I introduce delays between file downloads to prevent server overload?"*
+
+6. **Error Handling and Reliability**
+ - *"What if the files I expect to download don’t appear or the links are broken?"*
+ - *"How can I handle incorrect paths, nonexistent directories, or failed downloads gracefully?"*
+
+7. **Timing and Performance**
+ - *"When should I use `wait_for` and how do I choose an appropriate delay?"*
+ - *"Can I start the download and continue processing other tasks concurrently?"*
+
+8. **Security Considerations**
+ - *"What precautions should I take with downloaded files?"*
+ - *"How can I ensure that downloaded files are safe before processing them further?"*
+
+9. **Integration with Other Crawl4AI Features**
+ - *"Can I combine file downloading with other extraction strategies or LLM-based processes?"*
+ - *"How do I manage downloads when running multiple parallel crawls?"*
+
+### Topics Discussed in the File
+
+- **Enabling Downloads in Crawl4AI**:
+ Configure the crawler through `BrowserConfig` or `CrawlerRunConfig` to allow file downloads.
+
+- **Download Locations**:
+ Specify a custom `downloads_path` or rely on the default directory (`~/.crawl4ai/downloads`).
+
+- **Triggering File Downloads**:
+ Use JavaScript code injection (`js_code`) to simulate user interactions (e.g., clicking a download link). Employ `wait_for` to allow time for downloads to initiate.
+
+- **Accessing Downloaded Files**:
+ After the crawl, `result.downloaded_files` provides a list of paths to the downloaded files. Use these paths to verify file sizes or further process the files.
+
+- **Handling Multiple Files**:
+ Loop through downloadable elements on the page, introduce delays, and wait for downloads to complete before proceeding.
+
+- **Error and Timing Considerations**:
+ Manage potential errors when downloads fail or timing issues arise. Adjust `wait_for` and error handling logic to ensure stable and reliable file retrievals.
+
+- **Security Precautions**:
+ Always verify the integrity and safety of downloaded files before using them in your application.
+
+In summary, the file explains how to set up, initiate, and manage file downloads within the Crawl4AI framework, including specifying directories, triggering downloads programmatically, handling multiple files, and accessing downloaded results. It also covers timing, error handling, and security best practices.
\ No newline at end of file
diff --git a/.local/llm.txt/11_page_interaction.md b/.local/llm.txt/11_page_interaction.md
new file mode 100644
index 00000000..2a60ae85
--- /dev/null
+++ b/.local/llm.txt/11_page_interaction.md
@@ -0,0 +1,190 @@
+# Page Interaction
+
+Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events.
+
+## JavaScript Execution
+
+### Basic Execution
+
+```python
+from crawl4ai.async_configs import CrawlerRunConfig
+
+# Single JavaScript command
+config = CrawlerRunConfig(
+ js_code="window.scrollTo(0, document.body.scrollHeight);"
+)
+result = await crawler.arun(url="https://example.com", config=config)
+
+# Multiple commands
+js_commands = [
+ "window.scrollTo(0, document.body.scrollHeight);",
+ "document.querySelector('.load-more').click();",
+ "document.querySelector('#consent-button').click();"
+]
+config = CrawlerRunConfig(js_code=js_commands)
+result = await crawler.arun(url="https://example.com", config=config)
+```
+
+### Wait Conditions
+
+### CSS-Based Waiting
+
+Wait for elements to appear:
+
+```python
+config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content'
+result = await crawler.arun(url="https://example.com", config=config)
+```
+
+### JavaScript-Based Waiting
+
+Wait for custom conditions:
+
+```python
+# Wait for number of elements
+wait_condition = """() => {
+ return document.querySelectorAll('.item').length > 10;
+}"""
+
+config = CrawlerRunConfig(wait_for=f"js:{wait_condition}")
+result = await crawler.arun(url="https://example.com", config=config)
+
+# Wait for dynamic content to load
+wait_for_content = """() => {
+ const content = document.querySelector('.content');
+ return content && content.innerText.length > 100;
+}"""
+
+config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}")
+result = await crawler.arun(url="https://example.com", config=config)
+```
+
+### Handling Dynamic Content
+
+### Load More Content
+
+Handle infinite scroll or load more buttons:
+
+```python
+config = CrawlerRunConfig(
+ js_code=[
+ "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom
+ "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more
+ ],
+ wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content
+)
+result = await crawler.arun(url="https://example.com", config=config)
+```
+
+### Form Interaction
+
+Handle forms and inputs:
+
+```python
+js_form_interaction = """
+ document.querySelector('#search').value = 'search term'; // Fill form fields
+ document.querySelector('form').submit(); // Submit form
+"""
+
+config = CrawlerRunConfig(
+ js_code=js_form_interaction,
+ wait_for="css:.results" # Wait for results to load
+)
+result = await crawler.arun(url="https://example.com", config=config)
+```
+
+### Timing Control
+
+### Delays and Timeouts
+
+Control timing of interactions:
+
+```python
+config = CrawlerRunConfig(
+ page_timeout=60000, # Page load timeout (ms)
+ delay_before_return_html=2.0 # Wait before capturing content
+)
+result = await crawler.arun(url="https://example.com", config=config)
+```
+
+### Complex Interactions Example
+
+Here's an example of handling a dynamic page with multiple interactions:
+
+```python
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+
+async def crawl_dynamic_content():
+ async with AsyncWebCrawler() as crawler:
+ # Initial page load
+ config = CrawlerRunConfig(
+ js_code="document.querySelector('.cookie-accept')?.click();", # Handle cookie consent
+ wait_for="css:.main-content"
+ )
+ result = await crawler.arun(url="https://example.com", config=config)
+
+ # Load more content
+ session_id = "dynamic_session" # Keep session for multiple interactions
+
+ for page in range(3): # Load 3 pages of content
+ config = CrawlerRunConfig(
+ session_id=session_id,
+ js_code=[
+ "window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom
+ "window.previousCount = document.querySelectorAll('.item').length;", # Store item count
+ "document.querySelector('.load-more')?.click();" # Click load more
+ ],
+ wait_for="""() => {
+ const currentCount = document.querySelectorAll('.item').length;
+ return currentCount > window.previousCount;
+ }""",
+ js_only=(page > 0) # Execute JS without reloading page for subsequent interactions
+ )
+ result = await crawler.arun(url="https://example.com", config=config)
+ print(f"Page {page + 1} items:", len(result.cleaned_html))
+
+ # Clean up session
+ await crawler.crawler_strategy.kill_session(session_id)
+```
+
+### Using with Extraction Strategies
+
+Combine page interaction with structured extraction:
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
+from crawl4ai.async_configs import CrawlerRunConfig
+
+# Pattern-based extraction after interaction
+schema = {
+ "name": "Dynamic Items",
+ "baseSelector": ".item",
+ "fields": [
+ {"name": "title", "selector": "h2", "type": "text"},
+ {"name": "description", "selector": ".desc", "type": "text"}
+ ]
+}
+
+config = CrawlerRunConfig(
+ js_code="window.scrollTo(0, document.body.scrollHeight);",
+ wait_for="css:.item:nth-child(10)", # Wait for 10 items
+ extraction_strategy=JsonCssExtractionStrategy(schema)
+)
+result = await crawler.arun(url="https://example.com", config=config)
+
+# Or use LLM to analyze dynamic content
+class ContentAnalysis(BaseModel):
+ topics: List[str]
+ summary: str
+
+config = CrawlerRunConfig(
+ js_code="document.querySelector('.show-more').click();",
+ wait_for="css:.full-content",
+ extraction_strategy=LLMExtractionStrategy(
+ provider="ollama/nemotron",
+ schema=ContentAnalysis.schema(),
+ instruction="Analyze the full content"
+ )
+)
+result = await crawler.arun(url="https://example.com", config=config)
+```
diff --git a/.local/llm.txt/11_page_interaction.q.md b/.local/llm.txt/11_page_interaction.q.md
new file mode 100644
index 00000000..e469947f
--- /dev/null
+++ b/.local/llm.txt/11_page_interaction.q.md
@@ -0,0 +1,64 @@
+Below is a structured list of hypothetical questions derived from the file’s content, followed by a bullet-point summary of key topics discussed.
+
+### Hypothetical Questions
+
+1. **JavaScript Execution Basics**
+ - *"How do I inject a single JavaScript command into the page using Crawl4AI?"*
+ - *"Can I run multiple JavaScript commands sequentially before extracting content?"*
+
+2. **Waiting for Conditions**
+ - *"How can I wait for a particular CSS element to appear before extracting data?"*
+ - *"Is there a way to wait for a custom JavaScript condition, like a minimum number of items to load?"*
+
+3. **Handling Dynamic Content**
+ - *"How do I deal with infinite scrolling or 'Load More' buttons to continuously fetch new data?"*
+ - *"Can I simulate user interactions (clicking buttons, scrolling) to reveal more content?"*
+
+4. **Form Interactions**
+ - *"How can I fill out and submit a form on a webpage using JavaScript injection?"*
+ - *"What if I need to handle multiple form fields or a multi-step submission process?"*
+
+5. **Timing Control and Delays**
+ - *"How can I set a page load timeout or introduce a delay before extracting the final HTML?"*
+ - *"When should I adjust `delay_before_return_html` to ensure the page is fully rendered?"*
+
+6. **Complex Interactions**
+ - *"How do I chain multiple interactions, like accepting cookies, scrolling, and then clicking 'Load More' several times?"*
+ - *"Can I maintain a session to continue interacting with the page across multiple steps?"*
+
+7. **Integration with Extraction Strategies**
+ - *"How do I combine JavaScript-based interactions with a structured extraction strategy like `JsonCssExtractionStrategy`?"*
+ - *"Is it possible to use LLM-based extraction after dynamically revealing more content?"*
+
+8. **Troubleshooting Interactions**
+ - *"What if my JavaScript code fails or the element I want to interact with isn’t available?"*
+ - *"How can I verify that the dynamic content I triggered actually loaded before extraction?"*
+
+9. **Performance and Reliability**
+ - *"Do I need to consider timeouts and backoffs when dealing with heavily dynamic pages?"*
+ - *"How can I ensure that my JS-based interactions do not slow down the extraction process unnecessarily?"*
+
+### Topics Discussed in the File
+
+- **JavaScript Execution**:
+ Injecting single or multiple JS commands into the page to manipulate scrolling, clicks, or form submissions.
+
+- **Waiting Mechanisms**:
+ Using `wait_for` with CSS selectors (`"css:.some-element"`) or custom JavaScript conditions (`"js:() => {...}"`) to ensure the page is in the desired state before extraction.
+
+- **Dynamic Content Handling**:
+ Techniques for infinite scrolling, load more buttons, and other elements that reveal additional data after user-like interactions.
+
+- **Form Interaction**:
+ Filling out form fields, submitting forms, and waiting for results to appear.
+
+- **Timing Control**:
+ Setting page timeouts, introducing delays before returning HTML, and ensuring stable and complete extractions.
+
+- **Complex Interactions**:
+ Combining multiple steps (cookie acceptance, infinite scroll, load more clicks) and maintaining sessions across multiple steps for fully dynamic pages.
+
+- **Integration with Extraction Strategies**:
+ Applying pattern-based (CSS/JSON) or LLM-based extraction after performing required interactions to reveal the content of interest.
+
+In summary, the file provides detailed guidance on interacting with dynamic pages in Crawl4AI. It shows how to run JavaScript commands, wait for certain conditions, handle infinite scroll or complex user interactions, and integrate these techniques with content extraction strategies.
\ No newline at end of file
diff --git a/.local/llm.txt/12_prefix_based_input.md b/.local/llm.txt/12_prefix_based_input.md
new file mode 100644
index 00000000..f9155cbc
--- /dev/null
+++ b/.local/llm.txt/12_prefix_based_input.md
@@ -0,0 +1,158 @@
+# Prefix-Based Input Handling in Crawl4AI
+
+This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example.
+
+## Crawling a Web URL
+
+To crawl a live web page, provide the URL starting with `http://` or `https://`, using a `CrawlerRunConfig` object:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def crawl_web():
+ config = CrawlerRunConfig(bypass_cache=True)
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", config=config)
+ if result.success:
+ print("Markdown Content:")
+ print(result.markdown)
+ else:
+ print(f"Failed to crawl: {result.error_message}")
+
+asyncio.run(crawl_web())
+```
+
+## Crawling a Local HTML File
+
+To crawl a local HTML file, prefix the file path with `file://`.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def crawl_local_file():
+ local_file_path = "/path/to/apple.html" # Replace with your file path
+ file_url = f"file://{local_file_path}"
+ config = CrawlerRunConfig(bypass_cache=True)
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(url=file_url, config=config)
+ if result.success:
+ print("Markdown Content from Local File:")
+ print(result.markdown)
+ else:
+ print(f"Failed to crawl local file: {result.error_message}")
+
+asyncio.run(crawl_local_file())
+```
+
+## Crawling Raw HTML Content
+
+To crawl raw HTML content, prefix the HTML string with `raw:`.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def crawl_raw_html():
+    raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
+ raw_html_url = f"raw:{raw_html}"
+ config = CrawlerRunConfig(bypass_cache=True)
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(url=raw_html_url, config=config)
+ if result.success:
+ print("Markdown Content from Raw HTML:")
+ print(result.markdown)
+ else:
+ print(f"Failed to crawl raw HTML: {result.error_message}")
+
+asyncio.run(crawl_raw_html())
+```
+
+---
+
+## Complete Example
+
+Below is a comprehensive script that:
+
+1. Crawls the Wikipedia page for "Apple."
+2. Saves the HTML content to a local file (`apple.html`).
+3. Crawls the local HTML file and verifies the markdown length matches the original crawl.
+4. Crawls the raw HTML content from the saved file and verifies consistency.
+
+```python
+import os
+import sys
+import asyncio
+from pathlib import Path
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+
+async def main():
+ wikipedia_url = "https://en.wikipedia.org/wiki/apple"
+ script_dir = Path(__file__).parent
+ html_file_path = script_dir / "apple.html"
+
+ async with AsyncWebCrawler() as crawler:
+ # Step 1: Crawl the Web URL
+ print("\n=== Step 1: Crawling the Wikipedia URL ===")
+ web_config = CrawlerRunConfig(bypass_cache=True)
+ result = await crawler.arun(url=wikipedia_url, config=web_config)
+
+ if not result.success:
+ print(f"Failed to crawl {wikipedia_url}: {result.error_message}")
+ return
+
+ with open(html_file_path, 'w', encoding='utf-8') as f:
+ f.write(result.html)
+ web_crawl_length = len(result.markdown)
+ print(f"Length of markdown from web crawl: {web_crawl_length}\n")
+
+ # Step 2: Crawl from the Local HTML File
+ print("=== Step 2: Crawling from the Local HTML File ===")
+ file_url = f"file://{html_file_path.resolve()}"
+ file_config = CrawlerRunConfig(bypass_cache=True)
+ local_result = await crawler.arun(url=file_url, config=file_config)
+
+ if not local_result.success:
+ print(f"Failed to crawl local file {file_url}: {local_result.error_message}")
+ return
+
+ local_crawl_length = len(local_result.markdown)
+ assert web_crawl_length == local_crawl_length, "Markdown length mismatch"
+ print("✅ Markdown length matches between web and local file crawl.\n")
+
+ # Step 3: Crawl Using Raw HTML Content
+ print("=== Step 3: Crawling Using Raw HTML Content ===")
+ with open(html_file_path, 'r', encoding='utf-8') as f:
+ raw_html_content = f.read()
+ raw_html_url = f"raw:{raw_html_content}"
+ raw_config = CrawlerRunConfig(bypass_cache=True)
+ raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
+
+ if not raw_result.success:
+ print(f"Failed to crawl raw HTML content: {raw_result.error_message}")
+ return
+
+ raw_crawl_length = len(raw_result.markdown)
+ assert web_crawl_length == raw_crawl_length, "Markdown length mismatch"
+ print("✅ Markdown length matches between web and raw HTML crawl.\n")
+
+ print("All tests passed successfully!")
+ if html_file_path.exists():
+ os.remove(html_file_path)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+---
+
+## Conclusion
+
+With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can seamlessly handle web URLs, local HTML files, and raw HTML content. Use `CrawlerRunConfig` for flexible and consistent configuration in all scenarios.
\ No newline at end of file
diff --git a/.local/llm.txt/12_prefix_based_input.q.md b/.local/llm.txt/12_prefix_based_input.q.md
new file mode 100644
index 00000000..2e6d4f03
--- /dev/null
+++ b/.local/llm.txt/12_prefix_based_input.q.md
@@ -0,0 +1,56 @@
+### Hypothetical Questions
+
+1. **Basic Usage**
+ - *"How can I crawl a regular website URL using Crawl4AI?"*
+ - *"What configuration object do I need to pass to `arun` for basic crawling scenarios?"*
+
+2. **Local HTML Files**
+ - *"How do I crawl an HTML file stored locally on my machine?"*
+ - *"What prefix should I use when specifying a local file path to `arun`?"*
+
+3. **Raw HTML Strings**
+ - *"Is it possible to crawl a raw HTML string without saving it to a file first?"*
+ - *"How do I prefix a raw HTML string so that Crawl4AI treats it like HTML content?"*
+
+4. **Verifying Results**
+ - *"Can I compare the extracted Markdown content from a live page with that of a locally saved or raw version to ensure they match?"*
+ - *"How do I handle errors or check if the crawl was successful?"*
+
+5. **Use Cases**
+ - *"When would I want to use `file://` vs. `raw:` URLs?"*
+ - *"Can I reuse the same code structure for various input types (web URL, file, raw HTML)?"*
+
+6. **Caching and Configuration**
+ - *"What does `bypass_cache=True` do and when should I use it?"*
+ - *"Is there a simpler way to configure crawling options uniformly across web URLs, local files, and raw HTML?"*
+
+7. **Practical Scenarios**
+ - *"How can I integrate file-based crawling into a pipeline that starts from a live page, saves the HTML, and then crawls that local file for consistency checks?"*
+ - *"Does Crawl4AI’s prefix-based handling allow me to pre-process raw HTML (e.g., downloaded from another source) without hosting it on a local server?"*
+
+### Topics Discussed in the File
+
+- **Prefix-Based Input Handling**:
+ Introducing the concept of using `http://` or `https://` for web URLs, `file://` for local files, and `raw:` for direct HTML strings. This unified approach allows seamless handling of different content sources within Crawl4AI.
+
+- **Crawling a Web URL**:
+ Demonstrating how to crawl a live web page (like a Wikipedia article) using `AsyncWebCrawler` and `CrawlerRunConfig`.
+
+- **Crawling a Local HTML File**:
+ Showing how to convert a local file path to a `file://` URL and use `arun` to process it, ensuring that previously saved HTML can be re-crawled for verification or offline analysis.
+
+- **Crawling Raw HTML Content**:
+ Explaining how to directly pass an HTML string prefixed with `raw:` to `arun`, enabling quick tests or processing of HTML code obtained from other sources without saving it to disk.
+
+- **Consistency and Verification**:
+ Providing a comprehensive example that:
+ 1. Crawls a live Wikipedia page.
+ 2. Saves the HTML to a file.
+ 3. Re-crawls the local file.
+ 4. Re-crawls the content as a raw HTML string.
+ 5. Verifies that the Markdown extracted remains consistent across all three methods.
+
+- **Integration with `CrawlerRunConfig`**:
+ Showing how to use `CrawlerRunConfig` to disable caching (`bypass_cache=True`) and ensure fresh results for each test run.
+
+In summary, the file highlights how to use Crawl4AI’s prefix-based handling to effortlessly switch between crawling live web pages, local HTML files, and raw HTML strings. It also demonstrates a detailed workflow for verifying consistency and correctness across various input methods.
\ No newline at end of file
diff --git a/.local/llm.txt/13_hooks_auth.md b/.local/llm.txt/13_hooks_auth.md
new file mode 100644
index 00000000..a8cd77b7
--- /dev/null
+++ b/.local/llm.txt/13_hooks_auth.md
@@ -0,0 +1,119 @@
+# Hooks & Auth for AsyncWebCrawler
+
+Crawl4AI's `AsyncWebCrawler` allows you to customize the behavior of the web crawler using hooks. Hooks are asynchronous functions called at specific points in the crawling process, allowing you to modify the crawler's behavior or perform additional actions. This updated documentation demonstrates how to use hooks, including the new `on_page_context_created` hook, and ensures compatibility with `BrowserConfig` and `CrawlerRunConfig`.
+
+In this example, we'll:
+
+1. Configure the browser and set up authentication when it's created.
+2. Apply custom routing and initial actions when the page context is created.
+3. Add custom headers before navigating to the URL.
+4. Log the current URL after navigation.
+5. Perform actions after JavaScript execution.
+6. Log the length of the HTML before returning it.
+
+## Hook Definitions
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+from playwright.async_api import Page, Browser, BrowserContext
+
+def log_routing(route):
+ # Example: block loading images
+ if route.request.resource_type == "image":
+ print(f"[HOOK] Blocking image request: {route.request.url}")
+ asyncio.create_task(route.abort())
+ else:
+ asyncio.create_task(route.continue_())
+
+async def on_browser_created(browser: Browser, **kwargs):
+ print("[HOOK] on_browser_created")
+ # Example: Set browser viewport size and log in
+ context = await browser.new_context(viewport={"width": 1920, "height": 1080})
+ page = await context.new_page()
+ await page.goto("https://example.com/login")
+ await page.fill("input[name='username']", "testuser")
+ await page.fill("input[name='password']", "password123")
+ await page.click("button[type='submit']")
+ await page.wait_for_selector("#welcome")
+ await context.add_cookies([{"name": "auth_token", "value": "abc123", "url": "https://example.com"}])
+ await page.close()
+ await context.close()
+
+async def on_page_context_created(context: BrowserContext, page: Page, **kwargs):
+ print("[HOOK] on_page_context_created")
+ await context.route("**", log_routing)
+
+async def before_goto(page: Page, context: BrowserContext, **kwargs):
+ print("[HOOK] before_goto")
+ await page.set_extra_http_headers({"X-Test-Header": "test"})
+
+async def after_goto(page: Page, context: BrowserContext, **kwargs):
+ print("[HOOK] after_goto")
+ print(f"Current URL: {page.url}")
+
+async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
+ print("[HOOK] on_execution_started")
+ await page.evaluate("console.log('Custom JS executed')")
+
+async def before_return_html(page: Page, context: BrowserContext, html: str, **kwargs):
+ print("[HOOK] before_return_html")
+ print(f"HTML length: {len(html)}")
+ return page
+```
+
+## Using the Hooks with AsyncWebCrawler
+
+```python
+async def main():
+ print("\n🔗 Using Crawler Hooks: Customize AsyncWebCrawler with hooks!")
+
+ # Configure browser and crawler settings
+ browser_config = BrowserConfig(
+ headless=True,
+ viewport_width=1920,
+ viewport_height=1080
+ )
+
+ crawler_run_config = CrawlerRunConfig(
+ js_code="window.scrollTo(0, document.body.scrollHeight);",
+ wait_for="footer"
+ )
+
+ # Initialize crawler
+ async with AsyncWebCrawler(browser_config=browser_config) as crawler:
+ crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
+ crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
+ crawler.crawler_strategy.set_hook("before_goto", before_goto)
+ crawler.crawler_strategy.set_hook("after_goto", after_goto)
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+ crawler.crawler_strategy.set_hook("before_return_html", before_return_html)
+
+ # Run the crawler
+ result = await crawler.arun(url="https://example.com", config=crawler_run_config)
+
+ print("\n📦 Crawler Hooks Result:")
+ print(result)
+
+asyncio.run(main())
+```
+
+## Explanation of Hooks
+
+- **`on_browser_created`**: Called when the browser is created. Use this to configure the browser or handle authentication (e.g., logging in and setting cookies).
+- **`on_page_context_created`**: Called when a new page context is created. Use this to apply routing, block resources, or inject custom logic before navigating to the URL.
+- **`before_goto`**: Called before navigating to the URL. Use this to add custom headers or perform other pre-navigation actions.
+- **`after_goto`**: Called after navigation. Use this to verify content or log the URL.
+- **`on_execution_started`**: Called after executing custom JavaScript. Use this to perform additional actions.
+- **`before_return_html`**: Called before returning the HTML content. Use this to log details or preprocess the content.
+
+## Additional Customizations
+
+- **Resource Management**: Use `on_page_context_created` to block or modify requests (e.g., block images, fonts, or third-party scripts).
+- **Dynamic Headers**: Use `before_goto` to add or modify headers dynamically based on the URL.
+- **Authentication**: Use `on_browser_created` to handle login processes and set authentication cookies or tokens.
+- **Content Analysis**: Use `before_return_html` to analyze or modify the extracted HTML content.
+
+These hooks provide powerful customization options for tailoring the crawling process to your needs.
+
diff --git a/.local/llm.txt/13_hooks_auth.q.md b/.local/llm.txt/13_hooks_auth.q.md
new file mode 100644
index 00000000..266a5278
--- /dev/null
+++ b/.local/llm.txt/13_hooks_auth.q.md
@@ -0,0 +1,58 @@
+Below is a structured list of hypothetical questions derived from the file’s content, followed by a bullet-point summary of key topics discussed.
+
+### Hypothetical Questions
+
+1. **General Hook Usage**
+ - *"What are hooks in Crawl4AI, and how do they help customize the crawling process?"*
+ - *"Which stages of the crawling lifecycle can I attach hooks to?"*
+
+2. **Specific Hooks**
+ - *"What does the `on_browser_created` hook allow me to do?"*
+ - *"How can I use the `on_page_context_created` hook to modify requests before navigation?"*
+ - *"When should I use `before_goto` and `after_goto` hooks?"*
+ - *"How does `on_execution_started` help with custom JavaScript execution?"*
+ - *"What kind of preprocessing can I do in `before_return_html`?"*
+
+3. **Authentication and Customization**
+ - *"How can I perform authentication (like logging in) before actual crawling begins?"*
+ - *"Can I set cookies, headers, or modify requests using hooks?"*
+
+4. **Error Handling and Debugging**
+ - *"If my hooks fail or raise errors, how is that handled during the crawling process?"*
+ - *"How can I use hooks to troubleshoot issues, like blocking image requests or logging console messages?"*
+
+5. **Complex Scenarios**
+ - *"Can I combine multiple hooks to handle complex workflows like login, script execution, and dynamic content blocking?"*
+ - *"Is it possible to add conditional logic in hooks to treat certain URLs differently?"*
+
+6. **Performance and Reliability**
+ - *"Do these hooks run asynchronously, and how does that affect the crawler’s performance?"*
+ - *"Can I cancel requests or actions via hooks to improve efficiency?"*
+
+7. **Integration with `BrowserConfig` and `CrawlerRunConfig`**
+ - *"How do I use `BrowserConfig` and `CrawlerRunConfig` in tandem with hooks?"*
+ - *"Does setting hooks require changes to the configuration objects or can I apply them at runtime?"*
+
+### Topics Discussed in the File
+
+- **Hooks in `AsyncWebCrawler`**:
+ Hooks are asynchronous callback functions triggered at key points in the crawling lifecycle. They allow advanced customization, such as modifying browser/page contexts, injecting scripts, or altering network requests.
+
+- **Hook Types and Purposes**:
+ - **`on_browser_created`**: Initialize browser state, handle authentication (login), set cookies.
+ - **`on_page_context_created`**: Set up request routing, block resources, or modify requests before navigation.
+ - **`before_goto`**: Add or modify HTTP headers, prepare the page before actually navigating to the target URL.
+ - **`after_goto`**: Verify the current URL, log details, or ensure that page navigation succeeded.
+ - **`on_execution_started`**: Perform actions right after JS execution, like logging console output or checking state.
+ - **`before_return_html`**: Analyze, log, or preprocess the extracted HTML before it’s returned.
+
+- **Practical Examples**:
+ Demonstrations of handling authentication via `on_browser_created`, blocking images using `on_page_context_created` with a custom routing function, adding HTTP headers in `before_goto`, and logging content details in `before_return_html`.
+
+- **Integration with Configuration Objects**:
+ Using `BrowserConfig` for initial browser settings and `CrawlerRunConfig` for specifying JavaScript code, wait conditions, and more, then combining them with hooks for a fully customizable crawling workflow.
+
+- **Asynchronous and Flexible**:
+ Hooks are async, fitting seamlessly into the event-driven model of crawling. They can abort requests, continue them, or conditionally modify behavior based on URL patterns.
+
+In summary, this file explains how to use hooks in Crawl4AI’s `AsyncWebCrawler` to customize nearly every aspect of the crawling process. By attaching hooks at various lifecycle stages, developers can implement authentication routines, block certain types of requests, tweak headers, run custom JS, and analyze the final HTML—all while maintaining control and flexibility.
\ No newline at end of file
diff --git a/.local/llm.txt/14_proxy_security.md b/.local/llm.txt/14_proxy_security.md
new file mode 100644
index 00000000..8989777b
--- /dev/null
+++ b/.local/llm.txt/14_proxy_security.md
@@ -0,0 +1,95 @@
+# Proxy & Security
+
+Configure proxy settings and enhance security features in Crawl4AI for reliable data extraction.
+
+## Basic Proxy Setup
+
+Simple proxy configuration with `BrowserConfig`:
+
+```python
+from crawl4ai.async_configs import BrowserConfig
+
+# Using proxy URL
+browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+
+# Using SOCKS proxy
+browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080")
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+```
+
+## Authenticated Proxy
+
+Use an authenticated proxy with `BrowserConfig`:
+
+```python
+from crawl4ai.async_configs import BrowserConfig
+
+proxy_config = {
+ "server": "http://proxy.example.com:8080",
+ "username": "user",
+ "password": "pass"
+}
+
+browser_config = BrowserConfig(proxy_config=proxy_config)
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+```
+
+## Rotating Proxies
+
+Example using a proxy rotation service and updating `BrowserConfig` dynamically:
+
+```python
+from crawl4ai.async_configs import BrowserConfig
+
+async def get_next_proxy():
+ # Your proxy rotation logic here
+ return {"server": "http://next.proxy.com:8080"}
+
+browser_config = BrowserConfig()
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ # Update proxy for each request
+ for url in urls:
+ proxy = await get_next_proxy()
+ browser_config.proxy_config = proxy
+ result = await crawler.arun(url=url, config=browser_config)
+```
+
+## Custom Headers
+
+Add security-related headers via `BrowserConfig`:
+
+```python
+from crawl4ai.async_configs import BrowserConfig
+
+headers = {
+ "X-Forwarded-For": "203.0.113.195",
+ "Accept-Language": "en-US,en;q=0.9",
+ "Cache-Control": "no-cache",
+ "Pragma": "no-cache"
+}
+
+browser_config = BrowserConfig(headers=headers)
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com")
+```
+
+## Combining with Magic Mode
+
+For maximum protection, combine proxy with Magic Mode via `CrawlerRunConfig` and `BrowserConfig`:
+
+```python
+from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
+
+browser_config = BrowserConfig(
+ proxy="http://proxy.example.com:8080",
+ headers={"Accept-Language": "en-US"}
+)
+crawler_config = CrawlerRunConfig(magic=True) # Enable all anti-detection features
+
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com", config=crawler_config)
+```
diff --git a/.local/llm.txt/14_proxy_security.q.md b/.local/llm.txt/14_proxy_security.q.md
new file mode 100644
index 00000000..c7c8bc70
--- /dev/null
+++ b/.local/llm.txt/14_proxy_security.q.md
@@ -0,0 +1,53 @@
+### Hypothetical Questions
+
+1. **Basic Proxy Configuration**
+ - *"How do I set a basic HTTP proxy for the crawler?"*
+ - *"Can I use a SOCKS proxy instead of an HTTP proxy?"*
+
+2. **Authenticated Proxies**
+ - *"How do I provide a username and password for an authenticated proxy server?"*
+ - *"What is the `proxy_config` dictionary, and how do I use it?"*
+
+3. **Rotating Proxies**
+ - *"How can I dynamically change the proxy server for each request?"*
+ - *"What patterns or logic can I implement to rotate proxies from a pool?"*
+
+4. **Custom Headers for Security and Anonymity**
+ - *"How do I set custom HTTP headers in `BrowserConfig` to appear more human-like or meet security policies?"*
+ - *"Can I add headers like `X-Forwarded-For`, `Accept-Language`, or `Cache-Control`?"*
+
+5. **Combining Proxies with Magic Mode**
+ - *"What is Magic Mode, and how does it help with anti-detection features?"*
+ - *"Can I use Magic Mode in combination with proxies and custom headers for better anonymity?"*
+
+6. **Troubleshooting and Edge Cases**
+ - *"What if my authenticated proxy doesn’t accept credentials?"*
+ - *"How do I handle errors when switching proxies mid-crawl?"*
+
+7. **Performance and Reliability**
+ - *"Does using a proxy slow down the crawling process?"*
+ - *"How do I ensure stable and fast connections when rotating proxies frequently?"*
+
+8. **Integration with Other Crawl4AI Features**
+ - *"Can I use proxy configurations with hooks, caching, or LLM extraction strategies?"*
+ - *"How do I integrate proxy-based crawling into a larger pipeline that includes data extraction and content filtering?"*
+
+
+### Topics Discussed in the File
+
+- **Proxy Configuration**:
+ Shows how to set an HTTP or SOCKS proxy in `BrowserConfig` for the crawler, enabling you to route traffic through a specific server.
+
+- **Authenticated Proxies**:
+ Demonstrates how to provide username and password credentials to access proxy servers that require authentication.
+
+- **Rotating Proxies**:
+ Suggests a pattern for dynamically updating proxy settings before each request, allowing you to cycle through multiple proxies to avoid throttling or blocking.
+
+- **Custom Headers**:
+ Explains how to add custom HTTP headers in `BrowserConfig` for security, anonymity, or compliance with certain websites’ requirements.
+
+- **Integration with Magic Mode**:
+ Shows how to combine proxy usage, custom headers, and Magic Mode (`magic=True` in `CrawlerRunConfig`) to enhance anti-detection measures, making it harder for websites to detect automated crawlers.
+
+In summary, the file explains how to configure proxies (including authenticated proxies), rotate them dynamically, set custom headers for extra security and privacy, and combine these techniques with Magic Mode for robust anti-detection strategies in Crawl4AI.
\ No newline at end of file
diff --git a/.local/llm.txt/15_screenshot_and_pdf_export.md b/.local/llm.txt/15_screenshot_and_pdf_export.md
new file mode 100644
index 00000000..4dcc3ff1
--- /dev/null
+++ b/.local/llm.txt/15_screenshot_and_pdf_export.md
@@ -0,0 +1,58 @@
+# Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI
+
+When dealing with very long web pages, traditional full-page screenshots can be slow or fail entirely. For large pages (like extensive Wikipedia articles), generating a single massive screenshot often leads to delays, memory issues, or style differences.
+
+## **The New Approach:**
+We’ve introduced a new feature that effortlessly handles even the biggest pages by first exporting them as a PDF, then converting that PDF into a high-quality image. This approach leverages the browser’s built-in PDF rendering, making it both stable and efficient for very long content. You also have the option to directly save the PDF for your own usage—no need for multiple passes or complex stitching logic.
+
+## **Key Benefits:**
+- **Reliability:** The PDF export never times out and works regardless of page length.
+- **Versatility:** Get both the PDF and a screenshot in one crawl, without reloading or reprocessing.
+- **Performance:** Skips manual scrolling and stitching images, reducing complexity and runtime.
+
+## **Simple Example:**
+```python
+import os, sys
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+
+# Adjust paths as needed
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+async def main():
+ async with AsyncWebCrawler() as crawler:
+ # Request both PDF and screenshot
+ result = await crawler.arun(
+ url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
+ cache_mode=CacheMode.BYPASS,
+ pdf=True,
+ screenshot=True
+ )
+
+ if result.success:
+ # Save screenshot
+ if result.screenshot:
+ from base64 import b64decode
+ with open(os.path.join(__location__, "screenshot.png"), "wb") as f:
+ f.write(b64decode(result.screenshot))
+
+ # Save PDF
+ if result.pdf:
+ pdf_bytes = b64decode(result.pdf)
+ with open(os.path.join(__location__, "page.pdf"), "wb") as f:
+ f.write(pdf_bytes)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## **What Happens Under the Hood:**
+- Crawl4AI navigates to the target page.
+- If `pdf=True`, it exports the current page as a full PDF, capturing all of its content no matter the length.
+- If `screenshot=True`, and a PDF is already available, it directly converts the first page of that PDF to an image for you—no repeated loading or scrolling.
+- Finally, you get your PDF and/or screenshot ready to use.
+
+## **Conclusion:**
+With this feature, Crawl4AI becomes even more robust and versatile for large-scale content extraction. Whether you need a PDF snapshot or a quick screenshot, you now have a reliable solution for even the most extensive webpages.
\ No newline at end of file
diff --git a/.local/llm.txt/15_screenshot_and_pdf_export.q.md b/.local/llm.txt/15_screenshot_and_pdf_export.q.md
new file mode 100644
index 00000000..921ace9a
--- /dev/null
+++ b/.local/llm.txt/15_screenshot_and_pdf_export.q.md
@@ -0,0 +1,50 @@
+Below is a structured list of hypothetical questions derived from the file’s content, followed by a bullet-point summary of key topics discussed.
+
+### Hypothetical Questions
+
+1. **Motivation and Use Cases**
+ - *"Why should I use the PDF-based screenshot approach for very long web pages?"*
+ - *"What are the benefits of generating a PDF before converting it to an image?"*
+
+2. **Workflow and Technical Process**
+ - *"How does Crawl4AI generate a PDF and then convert it into a screenshot?"*
+ - *"Do I need to manually scroll or stitch images to capture large pages?"*
+
+3. **Practical Steps**
+ - *"What code do I need to write to request both a PDF and a screenshot in one crawl?"*
+ - *"How do I save the resulting PDF and screenshot to disk?"*
+
+4. **Performance and Reliability**
+ - *"Will this PDF-based method time out or fail for extremely long pages?"*
+ - *"Is this approach faster or more memory-efficient than traditional full-page screenshots?"*
+
+5. **Additional Features and Customization**
+ - *"Can I save only the PDF without generating a screenshot?"*
+ - *"If I have a PDF, can I easily convert it to multiple images or just the first page?"*
+
+6. **Integration with Other Crawl4AI Features**
+ - *"Can I combine PDF/screenshot generation with other Crawl4AI extraction strategies or hooks?"*
+ - *"Is caching or proxying affected by PDF or screenshot generation?"*
+
+7. **Troubleshooting**
+ - *"What should I do if the screenshot or PDF does not appear in the result?"*
+ - *"How do I handle large PDF sizes or slow saves when dealing with massive pages?"*
+
+### Topics Discussed in the File
+
+- **New Approach to Large Page Screenshots**:
+ The document introduces a method to first export a page as a PDF using the browser’s built-in PDF rendering capabilities and then convert that PDF to an image if a screenshot is requested.
+
+- **Advantages Over Traditional Methods**:
+ This approach avoids timeouts, memory issues, and the complexity of stitching multiple images for extremely long pages. The PDF rendering is stable, reliable, and does not require the crawler to scroll through the entire page.
+
+- **One-Stop Solution**:
+ By enabling `pdf=True` and `screenshot=True`, you receive both the full-page PDF and a screenshot (converted from the PDF) in a single crawl. This reduces repetitive processes and complexity.
+
+- **How to Implement**:
+ Demonstrates code usage with `arun` to request both the PDF and screenshot, and how to save them to files. Explains that if a PDF is already generated, the screenshot is derived directly from it, simplifying the workflow.
+
+- **Integration and Efficiency**:
+ Compatible with other Crawl4AI features like caching and extraction strategies. Simplifies large-scale crawling pipelines needing both a textual representation (HTML extraction) and visual confirmations (PDF/screenshot).
+
+In summary, the file outlines a new feature for capturing full-page screenshots of massive web pages by first generating a stable, reliable PDF, then converting it into an image. This technique eliminates previous issues related to large content pages, ensuring smoother performance and simpler code maintenance.
\ No newline at end of file
diff --git a/.local/llm.txt/16_storage_state.md b/.local/llm.txt/16_storage_state.md
new file mode 100644
index 00000000..55858c4d
--- /dev/null
+++ b/.local/llm.txt/16_storage_state.md
@@ -0,0 +1,225 @@
+# Using `storage_state` to Pre-Load Cookies and LocalStorage
+
+Crawl4AI’s `AsyncWebCrawler` lets you preserve and reuse session data, including cookies and localStorage, across multiple runs. By providing a `storage_state`, you can start your crawls already “logged in” or with any other necessary session data—no need to repeat the login flow every time.
+
+## What is `storage_state`?
+
+`storage_state` can be:
+
+- A dictionary containing cookies and localStorage data.
+- A path to a JSON file that holds this information.
+
+When you pass `storage_state` to the crawler, it applies these cookies and localStorage entries before loading any pages. This means your crawler effectively starts in a known authenticated or pre-configured state.
+
+## Example Structure
+
+Here’s an example storage state:
+
+```json
+{
+ "cookies": [
+ {
+ "name": "session",
+ "value": "abcd1234",
+ "domain": "example.com",
+ "path": "/",
+ "expires": 1675363572.037711,
+ "httpOnly": false,
+ "secure": false,
+ "sameSite": "None"
+ }
+ ],
+ "origins": [
+ {
+ "origin": "https://example.com",
+ "localStorage": [
+ { "name": "token", "value": "my_auth_token" },
+ { "name": "refreshToken", "value": "my_refresh_token" }
+ ]
+ }
+ ]
+}
+```
+
+This JSON sets a `session` cookie and two localStorage entries (`token` and `refreshToken`) for `https://example.com`.
+
+---
+
+## Passing `storage_state` as a Dictionary
+
+You can directly provide the data as a dictionary:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+ storage_dict = {
+ "cookies": [
+ {
+ "name": "session",
+ "value": "abcd1234",
+ "domain": "example.com",
+ "path": "/",
+ "expires": 1675363572.037711,
+ "httpOnly": False,
+ "secure": False,
+ "sameSite": "None"
+ }
+ ],
+ "origins": [
+ {
+ "origin": "https://example.com",
+ "localStorage": [
+ {"name": "token", "value": "my_auth_token"},
+ {"name": "refreshToken", "value": "my_refresh_token"}
+ ]
+ }
+ ]
+ }
+
+ async with AsyncWebCrawler(
+ headless=True,
+ storage_state=storage_dict
+ ) as crawler:
+ result = await crawler.arun(url='https://example.com/protected')
+ if result.success:
+ print("Crawl succeeded with pre-loaded session data!")
+ print("Page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+---
+
+## Passing `storage_state` as a File
+
+If you prefer a file-based approach, save the JSON above to `mystate.json` and reference it:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+ async with AsyncWebCrawler(
+ headless=True,
+ storage_state="mystate.json" # Uses a JSON file instead of a dictionary
+ ) as crawler:
+ result = await crawler.arun(url='https://example.com/protected')
+ if result.success:
+ print("Crawl succeeded with pre-loaded session data!")
+ print("Page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+---
+
+## Using `storage_state` to Avoid Repeated Logins (Sign In Once, Use Later)
+
+A common scenario is when you need to log in to a site (entering username/password, etc.) to access protected pages. Doing so every crawl is cumbersome. Instead, you can:
+
+1. Perform the login once in a hook.
+2. After login completes, export the resulting `storage_state` to a file.
+3. On subsequent runs, provide that `storage_state` to skip the login step.
+
+**Step-by-Step Example:**
+
+**First Run (Perform Login and Save State):**
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def on_browser_created_hook(browser):
+ # Access the default context and create a page
+ context = browser.contexts[0]
+ page = await context.new_page()
+
+ # Navigate to the login page
+ await page.goto("https://example.com/login", wait_until="domcontentloaded")
+
+ # Fill in credentials and submit
+ await page.fill("input[name='username']", "myuser")
+ await page.fill("input[name='password']", "mypassword")
+ await page.click("button[type='submit']")
+ await page.wait_for_load_state("networkidle")
+
+ # Now the site sets tokens in localStorage and cookies
+ # Export this state to a file so we can reuse it
+ await context.storage_state(path="my_storage_state.json")
+ await page.close()
+
+async def main():
+ # First run: perform login and export the storage_state
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=True,
+ hooks={"on_browser_created": on_browser_created_hook},
+ use_persistent_context=True,
+ user_data_dir="./my_user_data"
+ ) as crawler:
+
+ # After on_browser_created_hook runs, we have storage_state saved to my_storage_state.json
+ result = await crawler.arun(
+ url='https://example.com/protected-page',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
+ )
+ print("First run result success:", result.success)
+ if result.success:
+ print("Protected page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**Second Run (Reuse Saved State, No Login Needed):**
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def main():
+ # Second run: no need to hook on_browser_created this time.
+ # Just provide the previously saved storage state.
+ async with AsyncWebCrawler(
+ headless=True,
+ verbose=True,
+ use_persistent_context=True,
+ user_data_dir="./my_user_data",
+ storage_state="my_storage_state.json" # Reuse previously exported state
+ ) as crawler:
+
+ # Now the crawler starts already logged in
+ result = await crawler.arun(
+ url='https://example.com/protected-page',
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True}),
+ )
+ print("Second run result success:", result.success)
+ if result.success:
+ print("Protected page HTML length:", len(result.html))
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+**What’s Happening Here?**
+
+- During the first run, the `on_browser_created_hook` logs into the site.
+- After logging in, the crawler exports the current session (cookies, localStorage, etc.) to `my_storage_state.json`.
+- On subsequent runs, passing `storage_state="my_storage_state.json"` starts the browser context with these tokens already in place, skipping the login steps.
+
+**Sign Out Scenario:**
+If the website allows you to sign out by clearing tokens or by navigating to a sign-out URL, you can also run a script that uses `on_browser_created_hook` or `arun` to simulate signing out, then export the resulting `storage_state` again. That would give you a baseline “logged out” state to start fresh from next time.
+
+---
+
+## Conclusion
+
+By using `storage_state`, you can skip repetitive actions, like logging in, and jump straight into crawling protected content. Whether you provide a file path or a dictionary, this powerful feature helps maintain state between crawls, simplifying your data extraction pipelines.
\ No newline at end of file
diff --git a/.local/llm.txt/16_storage_state_q.md b/.local/llm.txt/16_storage_state_q.md
new file mode 100644
index 00000000..241029bd
--- /dev/null
+++ b/.local/llm.txt/16_storage_state_q.md
@@ -0,0 +1,52 @@
+### Hypothetical Questions
+
+1. **Basic Concept of `storage_state`**
+ - *"What is `storage_state` and how does it help me maintain session data across crawls?"*
+ - *"Can I directly provide a dictionary of cookies and localStorage data, or do I need a file?"*
+
+2. **Cookies and LocalStorage Handling**
+ - *"How do I set cookies and localStorage items before starting my crawl?"*
+ - *"Can I specify multiple origins and different sets of localStorage keys per origin?"*
+
+3. **Using a `storage_state` File**
+ - *"How do I load session data from a JSON file?"*
+ - *"Can I export the current session state to a file and reuse it later?"*
+
+4. **Login and Authentication Scenarios**
+ - *"How can I use `storage_state` to skip the login process on subsequent runs?"*
+ - *"What’s the workflow for logging in once, exporting the session data, and then starting future crawls already logged in?"*
+
+5. **Updating or Changing the Session State**
+ - *"What if my session expires? Can I refresh the session and update the `storage_state` file?"*
+ - *"How can I revert to a 'logged out' state by clearing tokens or using a sign-out scenario?"*
+
+6. **Practical Use Cases**
+ - *"If I’m crawling a series of protected pages from the same site, how can `storage_state` speed up the process?"*
+ - *"Can I switch between multiple `storage_state` files for different accounts or different states (e.g., logged in vs. logged out)?"*
+
+7. **Performance and Reliability**
+ - *"Will using `storage_state` improve my crawl performance by reducing repeated actions?"*
+ - *"Are there any risks or complications when transferring `storage_state` between different environments?"*
+
+8. **Integration with Hooks and Configurations**
+ - *"How do I integrate `storage_state` with hooks for a one-time login flow?"*
+ - *"Can I still customize browser or page behavior with hooks if I start with a `storage_state`?"*
+
+### Topics Discussed in the File
+
+- **`storage_state` Overview**:
+ Explaining that `storage_state` is a mechanism to start crawls with preloaded cookies and localStorage data, eliminating the need to re-authenticate or re-set session data every time.
+
+- **Data Formats**:
+ You can provide `storage_state` as either a Python dictionary or a JSON file. The JSON structure includes cookies and localStorage entries associated with specific domains/origins.
+
+- **Practical Authentication Workflows**:
+ Demonstrating how to log in once (using a hook or manual interaction), then save the resulting `storage_state` to a file. Subsequent crawls can use this file to start already authenticated, greatly speeding up the process and simplifying pipelines.
+
+- **Updating or Changing State**:
+ The crawler can export the current session state to a file at any time. This allows reusing the same authenticated session, switching states, or returning to a baseline state (e.g., logged out) by applying a different `storage_state` file.
+
+- **Integration with Other Features**:
+ `storage_state` works seamlessly with `AsyncWebCrawler` and `CrawlerRunConfig`. You can still use hooks, JS code execution, and other Crawl4AI features alongside a preloaded session state.
+
+In summary, the file explains how to use `storage_state` to maintain and reuse session data (cookies, localStorage) across crawls in Crawl4AI, demonstrating how it streamlines workflows that require authentication or complex session setups.
\ No newline at end of file
diff --git a/.local/llm.txt/17_crawl_config.md b/.local/llm.txt/17_crawl_config.md
new file mode 100644
index 00000000..928ae1e2
--- /dev/null
+++ b/.local/llm.txt/17_crawl_config.md
@@ -0,0 +1,85 @@
+# CrawlerRunConfig Parameters Documentation
+
+## Content Processing Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `word_count_threshold` | int | 200 | Minimum number of words required for a content block to be processed |
+| `extraction_strategy` | ExtractionStrategy | None | Strategy to extract structured data from crawled pages. When None, uses NoExtractionStrategy |
+| `chunking_strategy` | ChunkingStrategy | RegexChunking() | Strategy to chunk content before extraction |
+| `markdown_generator` | MarkdownGenerationStrategy | None | Strategy for generating markdown from extracted content |
+| `content_filter` | RelevantContentFilter | None | Optional filter to prune irrelevant content |
+| `only_text` | bool | False | If True, attempt to extract text-only content where applicable |
+| `css_selector` | str | None | CSS selector to extract a specific portion of the page |
+| `excluded_tags` | list[str] | [] | List of HTML tags to exclude from processing |
+| `keep_data_attributes` | bool | False | If True, retain `data-*` attributes while removing unwanted attributes |
+| `remove_forms` | bool | False | If True, remove all `