Compare commits
44 Commits
unclecode-
...
ssh-server
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b1ac4fe023 | ||
|
|
a3c92141a1 | ||
|
|
3fd777dd6f | ||
|
|
d7200138a0 | ||
|
|
be37abe05a | ||
|
|
90ba51b52f | ||
|
|
11721eb0ce | ||
|
|
1222e456fb | ||
|
|
e8aaa57cb2 | ||
|
|
a661b3173d | ||
|
|
b781b6df96 | ||
|
|
14e537fdd3 | ||
|
|
64b33af0e0 | ||
|
|
1afcdb6996 | ||
|
|
ca625b3152 | ||
|
|
6521b4745f | ||
|
|
241862bfe6 | ||
|
|
f2491b6c1a | ||
|
|
886622cb1e | ||
|
|
13dc254438 | ||
|
|
096929153f | ||
|
|
7e95c38acb | ||
|
|
c697bf23e4 | ||
|
|
b951d34ed0 | ||
|
|
c8a10dc455 | ||
|
|
9e0ded8da0 | ||
|
|
48c27899b7 | ||
|
|
3c32b0abed | ||
|
|
a215ec08d6 | ||
|
|
5d3fef45f7 | ||
|
|
77df6db453 | ||
|
|
2124652327 | ||
|
|
255bde70c9 | ||
|
|
04808b5dc9 | ||
|
|
b3a150f3d1 | ||
|
|
de80a2da09 | ||
|
|
df4cda8322 | ||
|
|
7717a3b948 | ||
|
|
a4a6b2075f | ||
|
|
4010558885 | ||
|
|
b0cf5076da | ||
|
|
0d6e9e37ca | ||
|
|
9b0f71ba88 | ||
|
|
6ddccc144c |
22
.do/deploy.template.yaml
Normal file
22
.do/deploy.template.yaml
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
spec:
|
||||||
|
name: crawl4ai
|
||||||
|
services:
|
||||||
|
- name: crawl4ai
|
||||||
|
git:
|
||||||
|
branch: 0.3.74
|
||||||
|
repo_clone_url: https://github.com/unclecode/crawl4ai.git
|
||||||
|
dockerfile_path: Dockerfile
|
||||||
|
http_port: 11235
|
||||||
|
instance_count: 1
|
||||||
|
instance_size_slug: professional-xs
|
||||||
|
health_check:
|
||||||
|
http_path: /health
|
||||||
|
envs:
|
||||||
|
- key: INSTALL_TYPE
|
||||||
|
value: "basic"
|
||||||
|
- key: PYTHON_VERSION
|
||||||
|
value: "3.10"
|
||||||
|
- key: ENABLE_GPU
|
||||||
|
value: "false"
|
||||||
|
routes:
|
||||||
|
- path: /
|
||||||
@@ -1,2 +1 @@
|
|||||||
include requirements.txt
|
include requirements.txt
|
||||||
recursive-include crawl4ai/js_snippet *.js
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
# 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
|
# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
|
||||||
|
|
||||||
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
|
||||||
|
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.4.22"
|
__version__ = "0.4.2"
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ from .config import (
|
|||||||
from .user_agent_generator import UserAgentGenerator
|
from .user_agent_generator import UserAgentGenerator
|
||||||
from .extraction_strategy import ExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy
|
from .chunking_strategy import ChunkingStrategy
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
|
||||||
|
|
||||||
class BrowserConfig:
|
class BrowserConfig:
|
||||||
"""
|
"""
|
||||||
@@ -270,7 +269,6 @@ class CrawlerRunConfig:
|
|||||||
word_count_threshold: int = MIN_WORD_THRESHOLD ,
|
word_count_threshold: int = MIN_WORD_THRESHOLD ,
|
||||||
extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None
|
extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None
|
||||||
chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None
|
chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None
|
||||||
markdown_generator : MarkdownGenerationStrategy = None,
|
|
||||||
content_filter=None,
|
content_filter=None,
|
||||||
cache_mode=None,
|
cache_mode=None,
|
||||||
session_id: str = None,
|
session_id: str = None,
|
||||||
@@ -311,7 +309,6 @@ class CrawlerRunConfig:
|
|||||||
self.word_count_threshold = word_count_threshold
|
self.word_count_threshold = word_count_threshold
|
||||||
self.extraction_strategy = extraction_strategy
|
self.extraction_strategy = extraction_strategy
|
||||||
self.chunking_strategy = chunking_strategy
|
self.chunking_strategy = chunking_strategy
|
||||||
self.markdown_generator = markdown_generator
|
|
||||||
self.content_filter = content_filter
|
self.content_filter = content_filter
|
||||||
self.cache_mode = cache_mode
|
self.cache_mode = cache_mode
|
||||||
self.session_id = session_id
|
self.session_id = session_id
|
||||||
@@ -367,7 +364,6 @@ class CrawlerRunConfig:
|
|||||||
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
||||||
extraction_strategy=kwargs.get("extraction_strategy"),
|
extraction_strategy=kwargs.get("extraction_strategy"),
|
||||||
chunking_strategy=kwargs.get("chunking_strategy"),
|
chunking_strategy=kwargs.get("chunking_strategy"),
|
||||||
markdown_generator=kwargs.get("markdown_generator"),
|
|
||||||
content_filter=kwargs.get("content_filter"),
|
content_filter=kwargs.get("content_filter"),
|
||||||
cache_mode=kwargs.get("cache_mode"),
|
cache_mode=kwargs.get("cache_mode"),
|
||||||
session_id=kwargs.get("session_id"),
|
session_id=kwargs.get("session_id"),
|
||||||
|
|||||||
@@ -7,8 +7,7 @@ from pathlib import Path
|
|||||||
from typing import Optional, List, Union
|
from typing import Optional, List, Union
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
# from contextlib import nullcontext, asynccontextmanager
|
from contextlib import nullcontext, asynccontextmanager
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from .models import CrawlResult, MarkdownGenerationResult
|
from .models import CrawlResult, MarkdownGenerationResult
|
||||||
from .async_database import async_db_manager
|
from .async_database import async_db_manager
|
||||||
from .chunking_strategy import *
|
from .chunking_strategy import *
|
||||||
@@ -16,7 +15,6 @@ from .content_filter_strategy import *
|
|||||||
from .extraction_strategy import *
|
from .extraction_strategy import *
|
||||||
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
|
||||||
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
||||||
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
|
|
||||||
from .content_scraping_strategy import WebScrapingStrategy
|
from .content_scraping_strategy import WebScrapingStrategy
|
||||||
from .async_logger import AsyncLogger
|
from .async_logger import AsyncLogger
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
@@ -135,11 +133,16 @@ class AsyncWebCrawler:
|
|||||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||||
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
|
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def nullcontext(self):
|
||||||
|
yield
|
||||||
|
|
||||||
async def awarmup(self):
|
async def awarmup(self):
|
||||||
"""Initialize the crawler with warm-up sequence."""
|
"""Initialize the crawler with warm-up sequence."""
|
||||||
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
|
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
|
||||||
self.ready = True
|
self.ready = True
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def nullcontext(self):
|
async def nullcontext(self):
|
||||||
"""异步空上下文管理器"""
|
"""异步空上下文管理器"""
|
||||||
@@ -320,8 +323,7 @@ class AsyncWebCrawler:
|
|||||||
config=config, # Pass the config object instead of individual parameters
|
config=config, # Pass the config object instead of individual parameters
|
||||||
screenshot=screenshot_data,
|
screenshot=screenshot_data,
|
||||||
pdf_data=pdf_data,
|
pdf_data=pdf_data,
|
||||||
verbose=config.verbose,
|
verbose=config.verbose
|
||||||
**kwargs
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Set response data
|
# Set response data
|
||||||
@@ -422,8 +424,7 @@ class AsyncWebCrawler:
|
|||||||
css_selector=config.css_selector,
|
css_selector=config.css_selector,
|
||||||
only_text=config.only_text,
|
only_text=config.only_text,
|
||||||
image_description_min_word_threshold=config.image_description_min_word_threshold,
|
image_description_min_word_threshold=config.image_description_min_word_threshold,
|
||||||
content_filter=config.content_filter,
|
content_filter=config.content_filter
|
||||||
**kwargs
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if result is None:
|
if result is None:
|
||||||
@@ -434,29 +435,16 @@ class AsyncWebCrawler:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
|
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Extract results
|
# Extract results
|
||||||
|
markdown_v2 = result.get("markdown_v2", None)
|
||||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||||
|
markdown = sanitize_input_encode(result.get("markdown", ""))
|
||||||
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
||||||
fit_html = sanitize_input_encode(result.get("fit_html", ""))
|
fit_html = sanitize_input_encode(result.get("fit_html", ""))
|
||||||
media = result.get("media", [])
|
media = result.get("media", [])
|
||||||
links = result.get("links", [])
|
links = result.get("links", [])
|
||||||
metadata = result.get("metadata", {})
|
metadata = result.get("metadata", {})
|
||||||
|
|
||||||
# Markdown Generation
|
|
||||||
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
|
|
||||||
if not config.content_filter and not markdown_generator.content_filter:
|
|
||||||
markdown_generator.content_filter = PruningContentFilter()
|
|
||||||
|
|
||||||
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
|
|
||||||
cleaned_html=cleaned_html,
|
|
||||||
base_url=url,
|
|
||||||
# html2text_options=kwargs.get('html2text', {})
|
|
||||||
)
|
|
||||||
markdown_v2 = markdown_result
|
|
||||||
markdown = sanitize_input_encode(markdown_result.raw_markdown)
|
|
||||||
|
|
||||||
# Log processing completion
|
# Log processing completion
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
message="Processed {url:.50}... | Time: {timing}ms",
|
message="Processed {url:.50}... | Time: {timing}ms",
|
||||||
|
|||||||
@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
|
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
|
||||||
|
|
||||||
# markdown_content = self._generate_markdown_content(
|
markdown_content = self._generate_markdown_content(
|
||||||
# cleaned_html=cleaned_html,
|
cleaned_html=cleaned_html,
|
||||||
# html=html,
|
html=html,
|
||||||
# url=url,
|
url=url,
|
||||||
# success=success,
|
success=success,
|
||||||
# **kwargs
|
**kwargs
|
||||||
# )
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
# **markdown_content,
|
**markdown_content,
|
||||||
'cleaned_html': cleaned_html,
|
'cleaned_html': cleaned_html,
|
||||||
'success': success,
|
'success': success,
|
||||||
'media': media,
|
'media': media,
|
||||||
|
|||||||
@@ -1,21 +1,23 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
|
from crawl4ai.chunking_strategy import *
|
||||||
from crawl4ai.extraction_strategy import *
|
from crawl4ai.extraction_strategy import *
|
||||||
from crawl4ai.crawler_strategy import *
|
from crawl4ai.crawler_strategy import *
|
||||||
import asyncio
|
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
|
|
||||||
url = r'https://openai.com/api/pricing/'
|
url = r'https://openai.com/api/pricing/'
|
||||||
|
|
||||||
|
crawler = WebCrawler()
|
||||||
|
crawler.warmup()
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
class OpenAIModelFee(BaseModel):
|
class OpenAIModelFee(BaseModel):
|
||||||
model_name: str = Field(..., description="Name of the OpenAI model.")
|
model_name: str = Field(..., description="Name of the OpenAI model.")
|
||||||
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
|
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
|
||||||
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
result = crawler.run(
|
||||||
|
|
||||||
async def main():
|
|
||||||
# Use AsyncWebCrawler
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
url=url,
|
url=url,
|
||||||
word_count_threshold=1,
|
word_count_threshold=1,
|
||||||
extraction_strategy= LLMExtractionStrategy(
|
extraction_strategy= LLMExtractionStrategy(
|
||||||
@@ -28,13 +30,12 @@ async def main():
|
|||||||
'One extracted model JSON format should look like this: '\
|
'One extracted model JSON format should look like this: '\
|
||||||
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
|
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
|
||||||
),
|
),
|
||||||
|
bypass_cache=True,
|
||||||
)
|
)
|
||||||
print("Success:", result.success)
|
|
||||||
model_fees = json.loads(result.extracted_content)
|
model_fees = json.loads(result.extracted_content)
|
||||||
|
|
||||||
print(len(model_fees))
|
print(len(model_fees))
|
||||||
|
|
||||||
with open(".data/data.json", "w", encoding="utf-8") as f:
|
with open(".data/data.json", "w", encoding="utf-8") as f:
|
||||||
f.write(result.extracted_content)
|
f.write(result.extracted_content)
|
||||||
|
|
||||||
asyncio.run(main())
|
|
||||||
|
|||||||
@@ -142,7 +142,6 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
|
|||||||
crawler_config = CrawlerRunConfig(
|
crawler_config = CrawlerRunConfig(
|
||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
word_count_threshold=1,
|
word_count_threshold=1,
|
||||||
page_timeout = 80000,
|
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
provider=provider,
|
provider=provider,
|
||||||
api_token=api_token,
|
api_token=api_token,
|
||||||
@@ -498,21 +497,21 @@ async def main():
|
|||||||
|
|
||||||
# Advanced examples
|
# Advanced examples
|
||||||
# await extract_structured_data_using_css_extractor()
|
# await extract_structured_data_using_css_extractor()
|
||||||
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
||||||
# await crawl_dynamic_content_pages_method_1()
|
# await crawl_dynamic_content_pages_method_1()
|
||||||
# await crawl_dynamic_content_pages_method_2()
|
# await crawl_dynamic_content_pages_method_2()
|
||||||
|
|
||||||
# Browser comparisons
|
# Browser comparisons
|
||||||
# await crawl_custom_browser_type()
|
await crawl_custom_browser_type()
|
||||||
|
|
||||||
# Performance testing
|
# Performance testing
|
||||||
# await speed_comparison()
|
# await speed_comparison()
|
||||||
|
|
||||||
# Screenshot example
|
# Screenshot example
|
||||||
# await capture_and_save_screenshot(
|
await capture_and_save_screenshot(
|
||||||
# "https://www.example.com",
|
"https://www.example.com",
|
||||||
# os.path.join(__location__, "tmp/example_screenshot.jpg")
|
os.path.join(__location__, "tmp/example_screenshot.jpg")
|
||||||
# )
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
@@ -239,10 +239,8 @@ async def crawl_dynamic_content_pages_method_1():
|
|||||||
all_commits = []
|
all_commits = []
|
||||||
|
|
||||||
js_next_page = """
|
js_next_page = """
|
||||||
(() => {
|
|
||||||
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||||
if (button) button.click();
|
if (button) button.click();
|
||||||
})();
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
for page in range(3): # Crawl 3 pages
|
for page in range(3): # Crawl 3 pages
|
||||||
@@ -606,14 +604,14 @@ async def fit_markdown_remove_overlay():
|
|||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
# await simple_crawl()
|
await simple_crawl()
|
||||||
# await simple_example_with_running_js_code()
|
await simple_example_with_running_js_code()
|
||||||
# await simple_example_with_css_selector()
|
await simple_example_with_css_selector()
|
||||||
# # await use_proxy()
|
# await use_proxy()
|
||||||
# await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
|
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
|
||||||
# await extract_structured_data_using_css_extractor()
|
await extract_structured_data_using_css_extractor()
|
||||||
|
|
||||||
# LLM extraction examples
|
# LLM extraction examples
|
||||||
# await extract_structured_data_using_llm()
|
# await extract_structured_data_using_llm()
|
||||||
|
|||||||
231
docs/md/demo.md
Normal file
231
docs/md/demo.md
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
# Interactive Demo for Crowler
|
||||||
|
<div id="demo">
|
||||||
|
<form id="crawlForm" class="terminal-form">
|
||||||
|
<fieldset>
|
||||||
|
<legend>Enter URL and Options</legend>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="url">Enter URL:</label>
|
||||||
|
<input type="text" id="url" name="url" required>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="screenshot">Get Screenshot:</label>
|
||||||
|
<input type="checkbox" id="screenshot" name="screenshot">
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<button class="btn btn-default" type="submit">Submit</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</fieldset>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
<div id="loading" class="loading-message">
|
||||||
|
<div class="terminal-alert terminal-alert-primary">Loading... Please wait.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section id="response" class="response-section">
|
||||||
|
<h2>Response</h2>
|
||||||
|
<div class="tabs">
|
||||||
|
<ul class="tab-list">
|
||||||
|
<li class="tab-item" onclick="showTab('markdown')">Markdown</li>
|
||||||
|
<li class="tab-item" onclick="showTab('cleanedHtml')">Cleaned HTML</li>
|
||||||
|
<li class="tab-item" onclick="showTab('media')">Media</li>
|
||||||
|
<li class="tab-item" onclick="showTab('extractedContent')">Extracted Content</li>
|
||||||
|
<li class="tab-item" onclick="showTab('screenshot')">Screenshot</li>
|
||||||
|
<li class="tab-item" onclick="showTab('pythonCode')">Python Code</li>
|
||||||
|
</ul>
|
||||||
|
<div class="tab-content" id="tab-markdown">
|
||||||
|
<header>
|
||||||
|
<div>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('markdownContent')">Copy</button>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('markdownContent', 'markdown.md')">Download</button>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
<pre><code id="markdownContent" class="language-markdown hljs"></code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-content" id="tab-cleanedHtml" style="display: none;">
|
||||||
|
<header >
|
||||||
|
<div>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('cleanedHtmlContent')">Copy</button>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('cleanedHtmlContent', 'cleaned.html')">Download</button>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
<pre><code id="cleanedHtmlContent" class="language-html hljs"></code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-content" id="tab-media" style="display: none;">
|
||||||
|
<header >
|
||||||
|
<div>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('mediaContent')">Copy</button>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('mediaContent', 'media.json')">Download</button>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
<pre><code id="mediaContent" class="language-json hljs"></code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-content" id="tab-extractedContent" style="display: none;">
|
||||||
|
<header >
|
||||||
|
<div>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('extractedContentContent')">Copy</button>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('extractedContentContent', 'extracted_content.json')">Download</button>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
<pre><code id="extractedContentContent" class="language-json hljs"></code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-content" id="tab-screenshot" style="display: none;">
|
||||||
|
<header >
|
||||||
|
<div>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadImage('screenshotContent', 'screenshot.png')">Download</button>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
<pre><img id="screenshotContent" /></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="tab-content" id="tab-pythonCode" style="display: none;">
|
||||||
|
<header >
|
||||||
|
<div>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="copyToClipboard('pythonCode')">Copy</button>
|
||||||
|
<button class="btn btn-default btn-ghost btn-sm" onclick="downloadContent('pythonCode', 'example.py')">Download</button>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
<pre><code id="pythonCode" class="language-python hljs"></code></pre>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<div id="error" class="error-message" style="display: none; margin-top:1em;">
|
||||||
|
<div class="terminal-alert terminal-alert-error"></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
function showTab(tabId) {
|
||||||
|
const tabs = document.querySelectorAll('.tab-content');
|
||||||
|
tabs.forEach(tab => tab.style.display = 'none');
|
||||||
|
document.getElementById(`tab-${tabId}`).style.display = 'block';
|
||||||
|
}
|
||||||
|
|
||||||
|
function redo(codeBlock, codeText){
|
||||||
|
codeBlock.classList.remove('hljs');
|
||||||
|
codeBlock.removeAttribute('data-highlighted');
|
||||||
|
|
||||||
|
// Set new code and re-highlight
|
||||||
|
codeBlock.textContent = codeText;
|
||||||
|
hljs.highlightBlock(codeBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
function copyToClipboard(elementId) {
|
||||||
|
const content = document.getElementById(elementId).textContent;
|
||||||
|
navigator.clipboard.writeText(content).then(() => {
|
||||||
|
alert('Copied to clipboard');
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function downloadContent(elementId, filename) {
|
||||||
|
const content = document.getElementById(elementId).textContent;
|
||||||
|
const blob = new Blob([content], { type: 'text/plain' });
|
||||||
|
const url = window.URL.createObjectURL(blob);
|
||||||
|
const a = document.createElement('a');
|
||||||
|
a.style.display = 'none';
|
||||||
|
a.href = url;
|
||||||
|
a.download = filename;
|
||||||
|
document.body.appendChild(a);
|
||||||
|
a.click();
|
||||||
|
window.URL.revokeObjectURL(url);
|
||||||
|
document.body.removeChild(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
function downloadImage(elementId, filename) {
|
||||||
|
const content = document.getElementById(elementId).src;
|
||||||
|
const a = document.createElement('a');
|
||||||
|
a.style.display = 'none';
|
||||||
|
a.href = content;
|
||||||
|
a.download = filename;
|
||||||
|
document.body.appendChild(a);
|
||||||
|
a.click();
|
||||||
|
document.body.removeChild(a);
|
||||||
|
}
|
||||||
|
|
||||||
|
document.getElementById('crawlForm').addEventListener('submit', function(event) {
|
||||||
|
event.preventDefault();
|
||||||
|
document.getElementById('loading').style.display = 'block';
|
||||||
|
document.getElementById('response').style.display = 'none';
|
||||||
|
|
||||||
|
const url = document.getElementById('url').value;
|
||||||
|
const screenshot = document.getElementById('screenshot').checked;
|
||||||
|
const data = {
|
||||||
|
urls: [url],
|
||||||
|
bypass_cache: false,
|
||||||
|
word_count_threshold: 5,
|
||||||
|
screenshot: screenshot
|
||||||
|
};
|
||||||
|
|
||||||
|
fetch('https://crawl4ai.com/crawl', {
|
||||||
|
method: 'POST',
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'application/json'
|
||||||
|
},
|
||||||
|
body: JSON.stringify(data)
|
||||||
|
})
|
||||||
|
.then(response => {
|
||||||
|
if (!response.ok) {
|
||||||
|
if (response.status === 429) {
|
||||||
|
return response.json().then(err => {
|
||||||
|
throw Object.assign(new Error('Rate limit exceeded'), { status: 429, details: err });
|
||||||
|
});
|
||||||
|
}
|
||||||
|
throw new Error('Network response was not ok');
|
||||||
|
}
|
||||||
|
return response.json();
|
||||||
|
})
|
||||||
|
.then(data => {
|
||||||
|
data = data.results[0]; // Only one URL is requested
|
||||||
|
document.getElementById('loading').style.display = 'none';
|
||||||
|
document.getElementById('response').style.display = 'block';
|
||||||
|
redo(document.getElementById('markdownContent'), data.markdown);
|
||||||
|
redo(document.getElementById('cleanedHtmlContent'), data.cleaned_html);
|
||||||
|
redo(document.getElementById('mediaContent'), JSON.stringify(data.media, null, 2));
|
||||||
|
redo(document.getElementById('extractedContentContent'), data.extracted_content);
|
||||||
|
if (screenshot) {
|
||||||
|
document.getElementById('screenshotContent').src = `data:image/png;base64,${data.screenshot}`;
|
||||||
|
}
|
||||||
|
const pythonCode = `
|
||||||
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
|
|
||||||
|
crawler = WebCrawler()
|
||||||
|
crawler.warmup()
|
||||||
|
|
||||||
|
result = crawler.run(
|
||||||
|
url='${url}',
|
||||||
|
screenshot=${screenshot}
|
||||||
|
)
|
||||||
|
print(result)
|
||||||
|
`;
|
||||||
|
redo(document.getElementById('pythonCode'), pythonCode);
|
||||||
|
document.getElementById('error').style.display = 'none';
|
||||||
|
})
|
||||||
|
.catch(error => {
|
||||||
|
document.getElementById('loading').style.display = 'none';
|
||||||
|
document.getElementById('error').style.display = 'block';
|
||||||
|
let errorMessage = 'An unexpected error occurred. Please try again later.';
|
||||||
|
|
||||||
|
if (error.status === 429) {
|
||||||
|
const details = error.details;
|
||||||
|
if (details.retry_after) {
|
||||||
|
errorMessage = `Rate limit exceeded. Please wait ${parseFloat(details.retry_after).toFixed(1)} seconds before trying again.`;
|
||||||
|
} else if (details.reset_at) {
|
||||||
|
const resetTime = new Date(details.reset_at);
|
||||||
|
const waitTime = Math.ceil((resetTime - new Date()) / 1000);
|
||||||
|
errorMessage = `Rate limit exceeded. Please try again after ${waitTime} seconds.`;
|
||||||
|
} else {
|
||||||
|
errorMessage = `Rate limit exceeded. Please try again later.`;
|
||||||
|
}
|
||||||
|
} else if (error.message) {
|
||||||
|
errorMessage = error.message;
|
||||||
|
}
|
||||||
|
|
||||||
|
document.querySelector('#error .terminal-alert').textContent = errorMessage;
|
||||||
|
});
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
</div>
|
||||||
@@ -99,7 +99,7 @@ async def main():
|
|||||||
remove_overlay_elements=True,
|
remove_overlay_elements=True,
|
||||||
|
|
||||||
# Cache control
|
# Cache control
|
||||||
cache_mode=CacheMode.ENABLED # Use cache if available
|
cache_mode=CacheMode.ENABLE # Use cache if available
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
|
|||||||
158
main.py
158
main.py
@@ -380,97 +380,97 @@ def read_root():
|
|||||||
return {"message": "Crawl4AI API service is running"}
|
return {"message": "Crawl4AI API service is running"}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/crawl", dependencies=[Depends(verify_token)])
|
# @app.post("/crawl", dependencies=[Depends(verify_token)])
|
||||||
async def crawl(request: CrawlRequest) -> Dict[str, str]:
|
# async def crawl(request: CrawlRequest) -> Dict[str, str]:
|
||||||
task_id = await crawler_service.submit_task(request)
|
# task_id = await crawler_service.submit_task(request)
|
||||||
return {"task_id": task_id}
|
# return {"task_id": task_id}
|
||||||
|
|
||||||
@app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
|
# @app.get("/task/{task_id}", dependencies=[Depends(verify_token)])
|
||||||
async def get_task_status(task_id: str):
|
# async def get_task_status(task_id: str):
|
||||||
task_info = crawler_service.task_manager.get_task(task_id)
|
# task_info = crawler_service.task_manager.get_task(task_id)
|
||||||
if not task_info:
|
# if not task_info:
|
||||||
raise HTTPException(status_code=404, detail="Task not found")
|
# raise HTTPException(status_code=404, detail="Task not found")
|
||||||
|
|
||||||
response = {
|
# response = {
|
||||||
"status": task_info.status,
|
# "status": task_info.status,
|
||||||
"created_at": task_info.created_at,
|
# "created_at": task_info.created_at,
|
||||||
}
|
# }
|
||||||
|
|
||||||
if task_info.status == TaskStatus.COMPLETED:
|
# if task_info.status == TaskStatus.COMPLETED:
|
||||||
# Convert CrawlResult to dict for JSON response
|
# # Convert CrawlResult to dict for JSON response
|
||||||
if isinstance(task_info.result, list):
|
# if isinstance(task_info.result, list):
|
||||||
response["results"] = [result.dict() for result in task_info.result]
|
# response["results"] = [result.dict() for result in task_info.result]
|
||||||
else:
|
# else:
|
||||||
response["result"] = task_info.result.dict()
|
# response["result"] = task_info.result.dict()
|
||||||
elif task_info.status == TaskStatus.FAILED:
|
# elif task_info.status == TaskStatus.FAILED:
|
||||||
response["error"] = task_info.error
|
# response["error"] = task_info.error
|
||||||
|
|
||||||
return response
|
# return response
|
||||||
|
|
||||||
@app.post("/crawl_sync", dependencies=[Depends(verify_token)])
|
# @app.post("/crawl_sync", dependencies=[Depends(verify_token)])
|
||||||
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
|
# async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
|
||||||
task_id = await crawler_service.submit_task(request)
|
# task_id = await crawler_service.submit_task(request)
|
||||||
|
|
||||||
# Wait up to 60 seconds for task completion
|
# # Wait up to 60 seconds for task completion
|
||||||
for _ in range(60):
|
# for _ in range(60):
|
||||||
task_info = crawler_service.task_manager.get_task(task_id)
|
# task_info = crawler_service.task_manager.get_task(task_id)
|
||||||
if not task_info:
|
# if not task_info:
|
||||||
raise HTTPException(status_code=404, detail="Task not found")
|
# raise HTTPException(status_code=404, detail="Task not found")
|
||||||
|
|
||||||
if task_info.status == TaskStatus.COMPLETED:
|
# if task_info.status == TaskStatus.COMPLETED:
|
||||||
# Return same format as /task/{task_id} endpoint
|
# # Return same format as /task/{task_id} endpoint
|
||||||
if isinstance(task_info.result, list):
|
# if isinstance(task_info.result, list):
|
||||||
return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
|
# return {"status": task_info.status, "results": [result.dict() for result in task_info.result]}
|
||||||
return {"status": task_info.status, "result": task_info.result.dict()}
|
# return {"status": task_info.status, "result": task_info.result.dict()}
|
||||||
|
|
||||||
if task_info.status == TaskStatus.FAILED:
|
# if task_info.status == TaskStatus.FAILED:
|
||||||
raise HTTPException(status_code=500, detail=task_info.error)
|
# raise HTTPException(status_code=500, detail=task_info.error)
|
||||||
|
|
||||||
await asyncio.sleep(1)
|
# await asyncio.sleep(1)
|
||||||
|
|
||||||
# If we get here, task didn't complete within timeout
|
# # If we get here, task didn't complete within timeout
|
||||||
raise HTTPException(status_code=408, detail="Task timed out")
|
# raise HTTPException(status_code=408, detail="Task timed out")
|
||||||
|
|
||||||
@app.post("/crawl_direct", dependencies=[Depends(verify_token)])
|
# @app.post("/crawl_direct", dependencies=[Depends(verify_token)])
|
||||||
async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
|
# async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
|
||||||
try:
|
# try:
|
||||||
crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
|
# crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
|
||||||
extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
|
# extraction_strategy = crawler_service._create_extraction_strategy(request.extraction_config)
|
||||||
|
|
||||||
try:
|
# try:
|
||||||
if isinstance(request.urls, list):
|
# if isinstance(request.urls, list):
|
||||||
results = await crawler.arun_many(
|
# results = await crawler.arun_many(
|
||||||
urls=[str(url) for url in request.urls],
|
# urls=[str(url) for url in request.urls],
|
||||||
extraction_strategy=extraction_strategy,
|
# extraction_strategy=extraction_strategy,
|
||||||
js_code=request.js_code,
|
# js_code=request.js_code,
|
||||||
wait_for=request.wait_for,
|
# wait_for=request.wait_for,
|
||||||
css_selector=request.css_selector,
|
# css_selector=request.css_selector,
|
||||||
screenshot=request.screenshot,
|
# screenshot=request.screenshot,
|
||||||
magic=request.magic,
|
# magic=request.magic,
|
||||||
cache_mode=request.cache_mode,
|
# cache_mode=request.cache_mode,
|
||||||
session_id=request.session_id,
|
# session_id=request.session_id,
|
||||||
**request.extra,
|
# **request.extra,
|
||||||
)
|
# )
|
||||||
return {"results": [result.dict() for result in results]}
|
# return {"results": [result.dict() for result in results]}
|
||||||
else:
|
# else:
|
||||||
result = await crawler.arun(
|
# result = await crawler.arun(
|
||||||
url=str(request.urls),
|
# url=str(request.urls),
|
||||||
extraction_strategy=extraction_strategy,
|
# extraction_strategy=extraction_strategy,
|
||||||
js_code=request.js_code,
|
# js_code=request.js_code,
|
||||||
wait_for=request.wait_for,
|
# wait_for=request.wait_for,
|
||||||
css_selector=request.css_selector,
|
# css_selector=request.css_selector,
|
||||||
screenshot=request.screenshot,
|
# screenshot=request.screenshot,
|
||||||
magic=request.magic,
|
# magic=request.magic,
|
||||||
cache_mode=request.cache_mode,
|
# cache_mode=request.cache_mode,
|
||||||
session_id=request.session_id,
|
# session_id=request.session_id,
|
||||||
**request.extra,
|
# **request.extra,
|
||||||
)
|
# )
|
||||||
return {"result": result.dict()}
|
# return {"result": result.dict()}
|
||||||
finally:
|
# finally:
|
||||||
await crawler_service.crawler_pool.release(crawler)
|
# await crawler_service.crawler_pool.release(crawler)
|
||||||
except Exception as e:
|
# except Exception as e:
|
||||||
logger.error(f"Error in direct crawl: {str(e)}")
|
# logger.error(f"Error in direct crawl: {str(e)}")
|
||||||
raise HTTPException(status_code=500, detail=str(e))
|
# raise HTTPException(status_code=500, detail=str(e))
|
||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
async def health_check():
|
async def health_check():
|
||||||
|
|||||||
@@ -8,7 +8,7 @@ docs_dir: docs/md_v2
|
|||||||
nav:
|
nav:
|
||||||
- Home: 'index.md'
|
- Home: 'index.md'
|
||||||
- 'Installation': 'basic/installation.md'
|
- 'Installation': 'basic/installation.md'
|
||||||
- 'Docker Deplotment': 'basic/docker-deploymeny.md'
|
- 'Docker Deployment': 'basic/docker-deploymeny.md'
|
||||||
- 'Quick Start': 'basic/quickstart.md'
|
- 'Quick Start': 'basic/quickstart.md'
|
||||||
- Changelog & Blog:
|
- Changelog & Blog:
|
||||||
- 'Blog Home': 'blog/index.md'
|
- 'Blog Home': 'blog/index.md'
|
||||||
|
|||||||
3
setup.py
3
setup.py
@@ -57,9 +57,6 @@ setup(
|
|||||||
author_email="unclecode@kidocode.com",
|
author_email="unclecode@kidocode.com",
|
||||||
license="MIT",
|
license="MIT",
|
||||||
packages=find_packages(),
|
packages=find_packages(),
|
||||||
package_data={
|
|
||||||
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
|
|
||||||
},
|
|
||||||
install_requires=default_requirements
|
install_requires=default_requirements
|
||||||
+ ["playwright", "aiofiles"], # Added aiofiles
|
+ ["playwright", "aiofiles"], # Added aiofiles
|
||||||
extras_require={
|
extras_require={
|
||||||
|
|||||||
Reference in New Issue
Block a user